home *** CD-ROM | disk | FTP | other *** search
/ PC World 2002 September / PCWorld_2002-09_cd.bin / Software / Vyzkuste / httrack / httrack-3.20RC4.exe / {app} / src / htsparse.c < prev    next >
C/C++ Source or Header  |  2002-07-09  |  110KB  |  2,355 lines

  1. /* ------------------------------------------------------------ */
  2. /*
  3. HTTrack Website Copier, Offline Browser for Windows and Unix
  4. Copyright (C) Xavier Roche and other contributors
  5.  
  6. This program is free software; you can redistribute it and/or
  7. modify it under the terms of the GNU General Public License
  8. as published by the Free Software Foundation; either version 2
  9. of the License, or any later version.
  10.  
  11. This program is distributed in the hope that it will be useful,
  12. but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14. GNU General Public License for more details.
  15.  
  16. You should have received a copy of the GNU General Public License
  17. along with this program; if not, write to the Free Software
  18. Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  19.  
  20.  
  21. Important notes:
  22.  
  23. - We hereby ask people using this source NOT to use it in purpose of grabbing
  24. emails addresses, or collecting any other private information on persons.
  25. This would disgrace our work, and spoil the many hours we spent on it.
  26.  
  27.  
  28. Please visit our Website: http://www.httrack.com
  29. */
  30.  
  31.  
  32. /* ------------------------------------------------------------ */
  33. /* File: Main source                                            */
  34. /* DIRECT INCLUDE TO httrack.c                                  */
  35. /* Author: Xavier Roche                                         */
  36. /* ------------------------------------------------------------ */
  37.  
  38.  
  39. #if HTS_ANALYSTE
  40. if (hts_htmlcheck(r.adr,(int)r.size,urladr,urlfil)) {
  41. #endif          
  42.   FILE* fp=NULL;      // fichier Θcrit localement                                               // et si level>0
  43.   char* adr=r.adr;    // pointeur (on parcourt)
  44.   char* lastsaved;    // adresse du dernier octet sauvΘ + 1
  45.   if ( (opt.debug>1) && (opt.log!=NULL) ) {
  46.     fspc(opt.log,"debug"); fprintf(opt.log,"scan file.."LF); test_flush;
  47.   }
  48.  
  49.  
  50.   // Indexing!
  51. #if HTS_MAKE_KEYWORD_INDEX
  52.   if (opt.kindex) {
  53.     if (index_keyword(r.adr,r.size,r.contenttype,savename,opt.path_html)) {
  54.       if ( (opt.debug>1) && (opt.log!=NULL) ) {
  55.         fspc(opt.log,"debug"); fprintf(opt.log,"indexing file..done"LF); test_flush;
  56.       }
  57.     } else {
  58.       if ( (opt.debug>1) && (opt.log!=NULL) ) {
  59.         fspc(opt.log,"debug"); fprintf(opt.log,"indexing file..error!"LF); test_flush;
  60.       }
  61.     }
  62.   }
  63. #endif
  64.  
  65.   // Now, parsing
  66.   if ((opt.getmode & 1) && (ptr>0)) {  // rΘcupΘrer les html sur disque       
  67.     // créer le fichier html local
  68.     HT_ADD_FOP;   // Θcrire peu α peu le fichier
  69.   }
  70.   
  71.   if (!error) {
  72.     int detect_title=0;  // dΘtection  du title
  73.     //
  74.     char* in_media=NULL; // in other media type (real media and so..)
  75.     int intag=0;         // on est dans un tag
  76.     int incomment=0;     // dans un <!--
  77.     int inscript=0;      // dans un scipt pour applets javascript)
  78.     int inscript_tag=0;  // on est dans un <body onLoad="... terminΘ par >
  79.     char inscript_tag_lastc='\0';  
  80.                            // terminaison (" ou ') du "<body onLoad=.."
  81.     int inscriptgen=0;     // on est dans un code gΘnΘrant, ex aprΦs obj.write("..
  82.     char scriptgen_q='\0'; // caractΦre faisant office de guillemet (' ou ")
  83.     int no_esc_utf=0;      // ne pas echapper chars > 127
  84.     int nofollow=0;        // ne pas scanner
  85.     //
  86.     int parseall_lastc='\0';    // dernier caractΦre parsΘ pour parseall
  87.     int parseall_incomment=0;   // dans un /* */ (exemple: a = /* URL */ "img.gif";)
  88.     //
  89.     char* intag_start=adr;
  90.     char* intag_startattr=NULL;
  91.     int intag_start_valid=0;
  92.     HT_ADD_START;    // dΘbuter
  93.  
  94.  
  95.     /* statistics */
  96.     if ((opt.getmode & 1) && (ptr>0)) { 
  97.       /*
  98.       HTS_STAT.stat_files++;
  99.       HTS_STAT.stat_bytes+=r.size;
  100.       */
  101.     }
  102.  
  103.     /* Primary list or URLs */
  104.     if (ptr == 0) {
  105.       intag=1;
  106.       intag_start_valid=0;
  107.     }
  108.     /* Check if the file is a .js file */
  109.     else if (
  110.       (strfield2(r.contenttype,"application/x-javascript")!=0)
  111.       || (strfield2(r.contenttype,"text/css")!=0)
  112.       ) {      /* JavaScript js file */
  113.       inscript=1;
  114.       intag=1;     // because aprΦs <script> on y est .. - pas utile
  115.       intag_start_valid=0;    // OUI car nous sommes dans du code, plus dans du "vrai" tag
  116.       if ((opt.debug>1) && (opt.log!=NULL)) {
  117.         fspc(opt.log,"debug"); fprintf(opt.log,"note: this file is a javascript file"LF); test_flush;
  118.       }
  119.     }
  120.     /* Or a real audio */
  121.     else if (strfield2(r.contenttype,"audio/x-pn-realaudio")!=0) {      /* realaudio link file */
  122.       inscript=intag=1;
  123.       intag_start_valid=0;
  124.       in_media="RAM";       // real media!
  125.     }
  126.     // Detect UTF8 format
  127.     if (is_unicode_utf8((unsigned char*) r.adr, (unsigned int) r.size) == 1) {
  128.       no_esc_utf=1;
  129.     } else {
  130.       no_esc_utf=0;
  131.     }
  132.     // Hack to prevent any problems with ram files of other files
  133.     * ( r.adr + r.size ) = '\0';
  134.  
  135.  
  136.     // ------------------------------------------------------------
  137.     // analyser ce qu'il y a en mémoire (fichier html)
  138.     // on scanne les balises
  139.     // ------------------------------------------------------------
  140. #if HTS_ANALYSTE
  141.     _hts_in_html_done=0;     // 0% scannΘs
  142.     _hts_cancel=0;           // pas de cancel
  143.     _hts_in_html_parsing=1;  // flag pour indiquer un parsing
  144. #endif
  145.     base[0]='\0';    // effacer base-href
  146.     lastsaved=adr;
  147.     do {
  148.       int p=0;
  149.       int valid_p=0;      // force to take p even if == 0
  150.       int ending_p='\0';  // ending quote?
  151.       error=0;
  152.  
  153.       /* Hack to avoid NULL char problems with C syntax */
  154.       /* Yes, some bogus HTML pages can embed null chars
  155.          and therefore can not be properly handled if this hack is not done
  156.       */
  157.       if ( ! (*adr) ) {
  158.         if ( ((int) (adr - r.adr)) < r.size)
  159.           *adr=' ';
  160.       }
  161.  
  162.  
  163.  
  164.       /*
  165.       index.html built here
  166.       */
  167.       // Construction index.html (sommaire)
  168.       // Avant de tester les a href,
  169.       // Ici on teste si l'on doit construire l'index vers le(s) site(s) miroir(s)
  170.       if (!makeindex_done) {  // autoriation d'Θcrire un index
  171.         if (!detect_title) {
  172.           if (opt.depth == liens[ptr]->depth) {    // on note toujours les premiers liens
  173.             if (!in_media) {
  174.               if (opt.makeindex && (ptr>0)) {
  175.                 if (opt.getmode & 1) {  // autorisation d'Θcrire
  176.                   p=strfield(adr,"title");  
  177.                   if (p) {
  178.                     if (*(adr-1)=='/') p=0;    // /title
  179.                   } else {
  180.                     if (strfield(adr,"/html"))
  181.                       p=-1;                    // noter, mais sans titre
  182.                     else if (strfield(adr,"body"))
  183.                       p=-1;                    // noter, mais sans titre
  184.                     else if ( ((int) (adr - r.adr) ) >= (r.size-1) )
  185.                       p=-1;                    // noter, mais sans titre
  186.                     else if ( (int) (adr - r.adr) >= r.size - 2)   // we got to hurry
  187.                       p=-1; // xxc xxc xxc
  188.                   }
  189.                 } else
  190.                   p=0;
  191.                 
  192.                 if (p) {    // ok center                            
  193.                   if (makeindex_fp==NULL) {
  194.                     verif_backblue(opt.path_html);    // gΘnΘrer gif
  195.                     makeindex_fp=filecreate(fconcat(opt.path_html,"index.html"));
  196.                     if (makeindex_fp!=NULL) {
  197.  
  198.                       // Header
  199.                       fprintf(makeindex_fp,template_header,
  200.                         "<!-- Mirror and index made by HTTrack Website Copier/"HTTRACK_VERSION" "HTTRACK_AFF_AUTHORS" -->"
  201.                         );
  202.  
  203.                     } else makeindex_done=-1;    // fait, erreur
  204.                   }
  205.                   
  206.                   if (makeindex_fp!=NULL) {
  207.                     char tempo[HTS_URLMAXSIZE*2];
  208.                     char s[HTS_URLMAXSIZE*2];
  209.                     char* a=NULL;
  210.                     char* b=NULL;
  211.                     s[0]='\0';
  212.                     if (p>0) {
  213.                       a=strchr(adr,'>');
  214.                       if (a!=NULL) {
  215.                         a++;
  216.                         while(is_space(*a)) a++;    // sauter espaces & co
  217.                         b=strchr(a,'<');   // prochain tag
  218.                       }
  219.                     }
  220.                     if (lienrelatif(tempo,liens[ptr]->sav,concat(opt.path_html,"index.html"))==0) {
  221.                       detect_title=1;      // ok dΘtectΘ pour cette page!
  222.                       makeindex_links++;   // un de plus
  223.                       strcpy(makeindex_firstlink,tempo);
  224.                       //
  225.                       if ((b==a) || (a==NULL) || (b==NULL)) {    // pas de titre
  226.                         strcpy(s,tempo);
  227.                       } else if ((b-a)<256) {
  228.                         b--;
  229.                         while(is_space(*b)) b--;
  230.                         strncpy(s,a,b-a+1);
  231.                         *(s+(b-a)+1)='\0';
  232.                       }
  233.  
  234.                       // Body
  235.                       fprintf(makeindex_fp,template_body,
  236.                         tempo,
  237.                         s
  238.                         );
  239.  
  240.                     }
  241.                   }
  242.                 }
  243.               }
  244.             }
  245.             
  246.           } else if (liens[ptr]->depth<opt.depth) {   // on a sautΘ level1+1 et level1
  247.             HT_INDEX_END;
  248.           }
  249.         } // if (opt.makeindex)
  250.       }
  251.       // FIN Construction index.html (sommaire)
  252.       /*
  253.       end -- index.html built here
  254.       */
  255.       
  256.  
  257.  
  258.       /* Parse */
  259.       if (
  260.            (*adr=='<')    /* No starting tag */
  261.         && (!inscript)    /* Not in (java)script */
  262.         && (!incomment)   /* Not in comment (<!--) */
  263.       ) { 
  264.         intag=1;
  265.         parseall_incomment=0;
  266.         //inquote=0;  // effacer quote
  267.         intag_start=adr; intag_start_valid=1;
  268.         codebase[0]='\0';    // effacer Θventuel codebase
  269.         
  270.         if (opt.getmode & 1) {  // sauver html
  271.           p=strfield(adr,"</html");
  272.           if (p==0) p=strfield(adr,"<head>");
  273.           // if (p==0) p=strfield(adr,"<doctype");
  274.           if (p) {
  275.             if (strnotempty(opt.footer)) {
  276.               char tempo[1024+HTS_URLMAXSIZE*2];
  277.               char gmttime[256];
  278.               char* eol="\n";
  279.               tempo[0]='\0';
  280.               if (strchr(r.adr,'\r'))
  281.                 eol="\r\n";
  282.               time_gmt_rfc822(gmttime);
  283.               strcat(tempo,eol);
  284.               sprintf(tempo+strlen(tempo),opt.footer,jump_identification(urladr),urlfil,gmttime,"","","","","","","","");
  285.               strcat(tempo,eol);
  286.               //fwrite(tempo,1,strlen(tempo),fp);
  287.               HT_ADD(tempo);
  288.             }
  289.           }
  290.         }        
  291.         
  292.         // éliminer les <!-- (commentaires) : intag dévalidé
  293.         if (*(adr+1)=='!')
  294.           if (*(adr+2)=='-')
  295.             if (*(adr+3)=='-') {
  296.               intag=0;
  297.               incomment=1;
  298.               intag_start_valid=0;
  299.             }
  300.             
  301.       }
  302.       else if (
  303.            (*adr=='>')                        /* ending tag */
  304.         && ( (!inscript) || (inscript_tag) )  /* and in tag (or in script) */
  305.       ) {
  306.         if (inscript_tag) {
  307.           inscript_tag=inscript=0;
  308.           intag=0;
  309.           incomment=0;
  310.           intag_start_valid=0;
  311.         } else if (!incomment) {
  312.           intag=0; //inquote=0;
  313.           
  314.           // entrée dans du javascript?
  315.           // on parse ICI car il se peut qu'on ait eu a parser les src=.. dedans
  316.           //if (!inscript) {  // sinon on est dans un obj.write("..
  317.           if ((intag_start_valid) && 
  318.             (
  319.             check_tag(intag_start,"script")
  320.             ||
  321.             check_tag(intag_start,"style")
  322.             )
  323.             ) {
  324.             char* a=intag_start;    // <
  325.             // ** while(is_realspace(*(--a)));
  326.             if (*a=='<') {  // s√r que c'est un tag?
  327.               inscript=1;
  328.               intag=1;     // because aprΦs <script> on y est .. - pas utile
  329.               intag_start_valid=0;    // OUI car nous sommes dans du code, plus dans du "vrai" tag
  330.             }
  331.           }
  332.         } else {                               /* end of comment? */
  333.           // vérifier fermeture correcte
  334.           if ( (*(adr-1)=='-') && (*(adr-2)=='-') ) {
  335.             intag=0;
  336.             incomment=0;
  337.             intag_start_valid=0;
  338.           }
  339. #if GT_ENDS_COMMENT
  340.           /* wrong comment ending */
  341.           else {
  342.             /* check if correct ending does not exists
  343.                <!-- foo > example <!-- bar > is sometimes accepted by browsers
  344.                when no --> is used somewhere else.. darn those browsers are dirty
  345.             */
  346.             if (!strstr(adr,"-->")) {
  347.               intag=0;
  348.               incomment=0;
  349.               intag_start_valid=0;
  350.             }
  351.           }
  352. #endif
  353.         }
  354.         //}
  355.       }
  356.       //else if (*adr==34) {
  357.       //  inquote=(inquote?0:1);
  358.       //}
  359.       else if (intag || inscript) {    // nous sommes dans un tag/commentaire, tester si on recoit un tag
  360.         int p_type=0;
  361.         int p_nocatch=0;
  362.         int p_searchMETAURL=0;  // chercher ..URL=<url>
  363.         int add_class=0;        // ajouter .class
  364.         int add_class_dots_to_patch=0;   // number of '.' in code="x.y.z<realname>"
  365.         char* p_flush=NULL;
  366.         
  367.         
  368.         // ------------------------------------------------------------
  369.         // parsing ΘvolΘ
  370.         // ------------------------------------------------------------
  371.         if (((isalpha((unsigned char)*adr)) || (*adr=='/') || (inscript) || (inscriptgen))) {  // sinon pas la peine de tester..
  372.  
  373.  
  374.           /* caractère de terminaison pour "miniparsing" javascript=.. ?
  375.              (ex: <a href="javascript:()" action="foo"> ) */
  376.           if (inscript_tag) {
  377.             if (inscript_tag_lastc) {
  378.               if (*adr == inscript_tag_lastc) {
  379.                 /* sortir */
  380.                 inscript_tag=inscript=0;
  381.                 incomment=0;
  382.               }
  383.             }
  384.           }
  385.           
  386.           
  387.           // Note:
  388.           // Certaines pages ne respectent pas le html
  389.           // notamment les guillemets ne sont pas fixés
  390.           // Nous sommes dans un tag, donc on peut faire un test plus
  391.           // large pour pouvoir prendre en compte ces particularités
  392.           
  393.           // à vérifier: ACTION, CODEBASE, VRML
  394.           
  395.           if (in_media) {
  396.             if (strcmp(in_media,"RAM")==0) { // real media
  397.               p=0;
  398.               valid_p=1;
  399.             }
  400.           } else if (ptr>0) {        /* pas premiΦre page 0 (primary) */
  401.             p=0;  // saut pour le nom de fichier: adresse nom fichier=adr+p
  402.             
  403.             // ------------------------------
  404.             // détection d'écriture JavaScript.
  405.             // osons les obj.write et les obj.href=.. ! osons!
  406.             // note: inscript==1 donc on sautera après les \"
  407.             if (inscript) {
  408.               if (inscriptgen) {          // on est dΘja dans un objet gΘnΘrant..
  409.                 if (*adr==scriptgen_q) {  // fermeture des " ou '
  410.                   if (*(adr-1)!='\\') {   // non
  411.                     inscriptgen=0;        // ok parsing terminΘ
  412.                   }
  413.                 }
  414.               } else {
  415.                 char* a=NULL;
  416.                 char check_this_fking_line=0;  // parsing code javascript..
  417.                 char must_be_terminated=0;     // caractΦre obligatoire de terminaison!
  418.                 int token_size;
  419.                 if (!(token_size=strfield(adr,".writeln"))) // dΘtection ...objet.write[ln]("code html")...
  420.                   token_size=strfield(adr,".write");
  421.                 if (token_size) {
  422.                   a=adr+token_size;
  423.                   while(is_realspace(*a)) a++; // sauter espaces
  424.                   if (*a=='(') {  // dΘbut parenthΦse
  425.                     check_this_fking_line=2;  // α parser!
  426.                     must_be_terminated=')';
  427.                     a++;  // sauter (
  428.                   }
  429.                 }
  430.                 // euhh ??? ???
  431.                 /* else if (strfield(adr,".href")) {  // dΘtection ...objet.href="...
  432.                 a=adr+5;
  433.                 while(is_realspace(*a)) a++; // sauter espaces
  434.                 if (*a=='=') {  // ohh un Θgal
  435.                 check_this_fking_line=1;  // α noter!
  436.                 must_be_terminated=';';   // et si t'as oubliΘ le ; tu sais pas coder
  437.                 a++;   // sauter =
  438.                 }
  439.                 
  440.                 }*/
  441.                 
  442.                 // on a un truc du genre instruction"code gΘnΘrΘ" dont on parse le code
  443.                 if (check_this_fking_line) {
  444.                   while(is_realspace(*a)) a++;
  445.                   if ((*a=='\'') || (*a=='"')) {  // dΘpart de '' ou ""
  446.                     char *b;
  447.                     int ex=0;
  448.                     scriptgen_q=*a;    // quote
  449.                     b=a+1;      // dΘpart de la chaεne
  450.                     // vΘrifier forme ("code") et pas ("code"+var), ingΘrable
  451.                     do {
  452.                       a++;  // caractΦre suivant
  453.                       if (*a==scriptgen_q) if (*(a-1)!='\\')  // quote non slash
  454.                         ex=1;            // sortie
  455.                       if ((*a==10) || (*a==13))
  456.                         ex=1;
  457.                     } while(!ex);
  458.                     if (*a==scriptgen_q) {  // fin du quote
  459.                       a++;
  460.                       while(is_realspace(*a)) a++;
  461.                       if (*a==must_be_terminated) {  // parenthΦse fermante: ("..")
  462.                         
  463.                         // bon, on doit parser une ligne javascript
  464.                         // 1) si check.. ==1 alors c'est un nom de fichier direct, donc
  465.                         // on fixe p sur le saut nécessaire pour atteindre le nom du fichier
  466.                         // et le moteur se débrouillera ensuite tout seul comme un grand
  467.                         // 2) si check==2 c'est un peu plus tordu car là on génère du
  468.                         // code html au sein de code javascript au sein de code html
  469.                         // dans ce cas on doit fixer un flag à un puis ensuite dans la boucle
  470.                         // on devra parser les instructions standard comme <a href etc
  471.                         // NOTE: le code javascript autogénéré n'est pas pris en compte!!
  472.                         // (et ne marche pas dans 50% des cas de toute facon!)
  473.                         if (check_this_fking_line==1) {
  474.                           p=(int) (b - adr);    // calculer saut!
  475.                         } else {
  476.                           inscriptgen=1;        // SCRIPTGEN actif
  477.                           adr=b;                // jump
  478.                         }
  479.                         
  480.                         if ((opt.debug>1) && (opt.log!=NULL)) {
  481.                           char str[512];
  482.                           str[0]='\0';
  483.                           strncat(str,b,minimum((int) (a - b + 1), 32));
  484.                           fspc(opt.log,"debug"); fprintf(opt.log,"active code (%s) detected in javascript: %s"LF,(check_this_fking_line==2)?"parse":"pickup",str); test_flush;
  485.                         }
  486.                       }
  487.                       
  488.                     }
  489.                     
  490.                   }
  491.                   
  492.                   
  493.                 }
  494.               }
  495.             }
  496.             // fin detection code générant javascript vers html
  497.             // ------------------------------
  498.             
  499.             
  500.             // analyse proprement dite, A HREF=.. etc..
  501.             if (!p) {
  502.               // si dans un tag, et pas dans un script - sauf si on analyse un obj.write("..
  503.               if ((intag && (!inscript)) || inscriptgen) {
  504.                 if ( (*(adr-1)=='<') || (is_space(*(adr-1))) ) {   // <tag < tag etc
  505.                   // <A HREF=.. pour les liens HTML
  506.                   p=rech_tageq(adr,"href");
  507.                   if (p) {    // href.. tester si c'est une bas href!
  508.                     if ((intag_start_valid) && check_tag(intag_start,"base")) {  // oui!
  509.                       // ** note: base href et codebase ne font pas bon mΘnage..
  510.                       p_type=2;    // c'est un chemin
  511.                     }
  512.                   }
  513.                   
  514.                   /* Tags supplémentaires à vérifier (<img src=..> etc) */
  515.                   if (p==0) {
  516.                     int i=0;
  517.                     while( (p==0) && (strnotempty(hts_detect[i])) ) {
  518.                       p=rech_tageq(adr,hts_detect[i]);
  519.                       i++;
  520.                     }
  521.                   }
  522.  
  523.                   /* Tags supplΘmentaires en dΘbut α vΘrifier (<object .. hotspot1=..> etc) */
  524.                   if (p==0) {
  525.                     int i=0;
  526.                     while( (p==0) && (strnotempty(hts_detectbeg[i])) ) {
  527.                       p=rech_tageqbegdigits(adr,hts_detectbeg[i]);
  528.                       i++;
  529.                     }
  530.                   }
  531.                   
  532.                   /* Tags supplΘmentaires α vΘrifier : URL=.. */
  533.                   if (p==0) {
  534.                     int i=0;
  535.                     while( (p==0) && (strnotempty(hts_detectURL[i])) ) {
  536.                       p=rech_tageq(adr,hts_detectURL[i]);
  537.                       i++;
  538.                     }
  539.                     if (p)
  540.                       p_searchMETAURL=1;
  541.                   }
  542.                   
  543.                   /* Tags supplΘmentaires α vΘrifier, mais α ne pas capturer */
  544.                   if (p==0) {
  545.                     int i=0;
  546.                     while( (p==0) && (strnotempty(hts_detectandleave[i])) ) {
  547.                       p=rech_tageq(adr,hts_detectandleave[i]);
  548.                       i++;
  549.                     }
  550.                     if (p)
  551.                       p_nocatch=1;      /* ne pas rechercher */
  552.                   }
  553.                   
  554.                   /* EvΘnements */
  555.                   if (p==0) {
  556.                     int i=0;
  557.                     /* dΘtection onLoad etc */
  558.                     while( (p==0) && (strnotempty(hts_detect_js[i])) ) {
  559.                       p=rech_tageq(adr,hts_detect_js[i]);
  560.                       i++;
  561.                     }
  562.                     /* non dΘtectΘ - dΘtecter Θgalement les onXxxxx= */
  563.                     if (p==0) {
  564.                       if ( (*adr=='o') && (*(adr+1)=='n') && isUpperLetter(*(adr+2)) ) {
  565.                         p=0;
  566.                         while(isalpha((unsigned char)adr[p]) && (p<64) ) p++;
  567.                         if (p<64) {
  568.                           while(is_space(adr[p])) p++;
  569.                           if (adr[p]=='=')
  570.                             p++;
  571.                           else p=0;
  572.                         } else p=0;
  573.                       }
  574.                     }
  575.                     /* OK, ΘvΘnement repΘrΘ */
  576.                     if (p) {
  577.                       inscript_tag_lastc=*(adr+p);     /* α attendre α la fin */
  578.                       adr+=p;     /* saut */
  579.                                   /*
  580.                                   On est dΘsormais dans du code javascript
  581.                       */
  582.                       inscript_tag=inscript=1;
  583.                     }
  584.                     p=0;        /* quoi qu'il arrive, ne rien dΘmarrer ici */
  585.                   }
  586.                   
  587.                   // <APPLET CODE=.. pour les applet java.. [CODEBASE (chemin..) α faire]
  588.                   if (p==0) {
  589.                     p=rech_tageq(adr,"code");
  590.                     if (p) {
  591.                       if ((intag_start_valid) && check_tag(intag_start,"applet")) {  // dans un <applet !
  592.                         p_type=-1;  // juste le nom de fichier+dossier, Θcire avant codebase 
  593.                         add_class=1;   // ajouter .class au besoin                         
  594.                         
  595.                         // vérifier qu'il n'y a pas de codebase APRES
  596.                         // sinon on swappe les deux.
  597.                         // pas très propre mais c'est ce qu'il y a de plus simple à faire!!
  598.                         
  599.                         {
  600.                           char *a;
  601.                           a=adr;
  602.                           while((*a) && (*a!='>') && (!rech_tageq(a,"codebase"))) a++;
  603.                           if (rech_tageq(a,"codebase")) {  // banzai! codebase=
  604.                             char* b;
  605.                             b=strchr(a,'>');
  606.                             if (b) {
  607.                               if (((int) (b - adr)) < 1000) {    // au total < 1Ko
  608.                                 char tempo[HTS_URLMAXSIZE*2];
  609.                                 tempo[0]='\0';
  610.                                 strncat(tempo,a,(int) (b - a) );
  611.                                 strcat( tempo," ");
  612.                                 strncat(tempo,adr,(int) (a - adr - 1));
  613.                                 // Θventuellement remplire par des espaces pour avoir juste la taille
  614.                                 while((int) strlen(tempo)<((int) (b - adr)))
  615.                                   strcat(tempo," ");
  616.                                 // pas d'erreur?
  617.                                 if ((int) strlen(tempo) == ((int) (b - adr) )) {
  618.                                   strncpy(adr,tempo,strlen(tempo));   // PAS d'octet nul α la fin!
  619.                                   p=0;    // DEVALIDER!!
  620.                                   p_type=0;
  621.                                   add_class=0;
  622.                                 }
  623.                               }
  624.                             }
  625.                           }
  626.                         }
  627.                         
  628.                       }
  629.                     }
  630.                   }
  631.                   
  632.                   // liens α patcher mais pas α charger (ex: codebase)
  633.                   if (p==0) {  // note: si non chargΘ (ex: ignorer .class) patchΘ tout de mΩme
  634.                     p=rech_tageq(adr,"codebase");
  635.                     if (p) {
  636.                       if ((intag_start_valid) && check_tag(intag_start,"applet")) {  // dans un <applet !
  637.                         p_type=-2;
  638.                       } else p=-1;   // ne plus chercher
  639.                     }
  640.                   }
  641.                   
  642.                   
  643.                   // Meta tags pour robots
  644.                   if (p==0) {
  645.                     if (opt.robots) {
  646.                       if ((intag_start_valid) && check_tag(intag_start,"meta")) {
  647.                         if (rech_tageq(adr,"name")) {    // name=robots.txt
  648.                           char tempo[1100];
  649.                           char* a;
  650.                           tempo[0]='\0';
  651.                           a=strchr(adr,'>');
  652. #if DEBUG_ROBOTS
  653.                           printf("robots.txt meta tag detected\n");
  654. #endif
  655.                           if (a) {
  656.                             if (((int) (a - adr)) < 999 ) {
  657.                               strncat(tempo,adr,(int) (a - adr));
  658.                               if (strstrcase(tempo,"content")) {
  659.                                 if (strstrcase(tempo,"robots")) {
  660.                                   if (strstrcase(tempo,"nofollow")) {
  661. #if DEBUG_ROBOTS
  662.                                     printf("robots.txt meta tag: nofollow in %s%s\n",urladr,urlfil);
  663. #endif
  664.                                     nofollow=1;       // NE PLUS suivre liens dans cette page
  665.                                     if (opt.errlog) {
  666.                                       fspc(opt.errlog,"warning"); fprintf(opt.errlog,"Link %s%s not scanned (follow robots meta tag)"LF,urladr,urlfil);
  667.                                       test_flush;
  668.                                     }
  669.                                   }
  670.                                 }
  671.                               }
  672.                             }
  673.                           }
  674.                         }
  675.                       }
  676.                     }
  677.                   }
  678.                   
  679.                   // entrée dans une applet javascript
  680.                   /*if (!inscript) {  // sinon on est dans un obj.write("..
  681.                   if (p==0)
  682.                   if (rech_sampletag(adr,"script"))
  683.                   if (check_tag(intag_start,"script")) {
  684.                   inscript=1;
  685.                   }
  686.                         }*/
  687.                   
  688.                   // Ici on procède à une analyse du code javascript pour tenter de récupérer
  689.                   // certains fichiers évidents.
  690.                   // C'est devenu obligatoire vu le nombre de pages qui intègrent
  691.                   // des images réactives par exemple
  692.                 }
  693.               } else if (inscript) {
  694.                 if (
  695.                   (
  696.                   (strfield(adr,"/script"))
  697.                   ||
  698.                   (strfield(adr,"/style"))
  699.                   )
  700.                   ) {
  701.                   char* a=adr;
  702.                   //while(is_realspace(*(--a)));
  703.                   while( is_realspace(*a) ) a--;
  704.                   a--;
  705.                   if (*a=='<') {  // s√r que c'est un tag?
  706.                     inscript=0;
  707.                   }
  708.                 } else {
  709.                   /*
  710.                   Script Analyzing - different types supported:
  711.                     foo="url"
  712.                     foo("url") or foo(url)
  713.                     foo "url"
  714.                   */
  715.                   int nc;
  716.                   char  expected     = '=';          // caractΦre attendu aprΦs
  717.                   char* expected_end = ";";
  718.                   int can_avoid_quotes=0;
  719.                   char quotes_replacement='\0';
  720.                   if (inscript_tag)
  721.                     expected_end=";\"\'";            // voir a href="javascript:doc.location='foo'"
  722.                   nc = strfield(adr,".src");  // nom.src="image";
  723.                   if (!nc) nc = strfield(adr,".location");  // document.location="doc"
  724.                   if (!nc) nc = strfield(adr,".href");  // document.location="doc"
  725.                   if (!nc) if ( (nc = strfield(adr,".open")) ) { // window.open("doc",..
  726.                     expected='(';    // parenthΦse
  727.                     expected_end="),";  // fin: virgule ou parenthΦse
  728.                   }
  729.                   if (!nc) if ( (nc = strfield(adr,".replace")) ) { // window.replace("url")
  730.                     expected='(';    // parenthΦse
  731.                     expected_end=")";  // fin: parenthΦse
  732.                   }
  733.                   if (!nc) if ( (nc = strfield(adr,".link")) ) { // window.link("url")
  734.                     expected='(';    // parenthΦse
  735.                     expected_end=")";  // fin: parenthΦse
  736.                   }
  737.                   if (!nc) if ( (nc = strfield(adr,"url")) ) { // url(url)
  738.                     expected='(';    // parenthΦse
  739.                     expected_end=")";  // fin: parenthΦse
  740.                     can_avoid_quotes=1;
  741.                     quotes_replacement=')';
  742.                   }
  743.                   if (!nc) if ( (nc = strfield(adr,"import")) ) { // import "url"
  744.                     if (is_space(*(adr+nc))) {
  745.                       expected=0;    // no char expected
  746.                     } else
  747.                       nc=0;
  748.                   }
  749.                   if (nc) {
  750.                     char *a;
  751.                     a=adr+nc;
  752.                     while(is_realspace(*a)) a++;
  753.                     if ((*a == expected) || (!expected)) {
  754.                       if (expected)
  755.                         a++;
  756.                       while(is_realspace(*a)) a++;
  757.                       if ((*a==34) || (*a=='\'') || (can_avoid_quotes)) {
  758.                         char *b,*c;
  759.                         int ndelim=1;
  760.                         if ((*a==34) || (*a=='\''))
  761.                           a++;
  762.                         else
  763.                           ndelim=0;
  764.                         b=a;
  765.                         if (ndelim) {
  766.                           while((*b!=34) && (*b!='\'') && (*b!='\0')) b++;
  767.                         }
  768.                         else {
  769.                           while((*b != quotes_replacement) && (*b!='\0')) b++;
  770.                         }
  771.                         c=b--; c+=ndelim;
  772.                         while(*c==' ') c++;
  773.                         if ((strchr(expected_end,*c)) || (*c=='\n') || (*c=='\r')) {
  774.                           c-=(ndelim+1);
  775.                           if ((int) (c - a + 1)) {
  776.                             if ((opt.debug>1) && (opt.log!=NULL)) {
  777.                               char str[512];
  778.                               str[0]='\0';
  779.                               strncat(str,a,minimum((int) (c - a + 1),32));
  780.                               fspc(opt.log,"debug"); fprintf(opt.log,"link detected in javascript: %s"LF,str); test_flush;
  781.                             }
  782.                             p=(int) (a - adr);    // p non nul: TRAITER CHAINE COMME FICHIER
  783.                             if (can_avoid_quotes) {
  784.                               ending_p=quotes_replacement;
  785.                             }
  786.                           }
  787.                         }
  788.                         
  789.                         
  790.                       }
  791.                     }
  792.                   }
  793.                   
  794.                 }
  795.               }
  796.             }
  797.             
  798.           } else {      // ptr == 0
  799.             //p=rech_tageq(adr,"primary");    // lien primaire, yeah
  800.             p=0;          // No stupid tag anymore, raw link
  801.             valid_p=1;    // Valid even if p==0
  802.             while ((adr[p] == '\r') || (adr[p] == '\n'))
  803.               p++;
  804.             //can_avoid_quotes=1;
  805.             ending_p='\r';
  806.           }       
  807.           
  808.         } else if (isspace((unsigned char)*adr)) {
  809.           intag_startattr=adr+1;        // attribute in tag (for dirty parsing)
  810.         }
  811.           
  812.           
  813.           // ------------------------------------------------------------
  814.           // dernier recours - parsing "sale" : détection systématique des .gif, etc.
  815.           // risque: générer de faux fichiers parasites
  816.           // fix: ne parse plus dans les commentaires
  817.           // ------------------------------------------------------------
  818.           if ( (opt.parseall) && (ptr>0) && (!in_media) ) {           // option parsing "brut"
  819.             int incomment_justquit=0;
  820.             if (!is_realspace(*adr)) {
  821.               int noparse=0;
  822.  
  823.               // Gestion des /* */
  824.               if (inscript) {
  825.                 if (parseall_incomment) {
  826.                   if ((*adr=='/') && (*(adr-1)=='*'))
  827.                     parseall_incomment=0;
  828.                   incomment_justquit=1;       // ne pas noter dernier caractΦre
  829.                 } else {
  830.                   if ((*adr=='/') && (*(adr+1)=='*'))
  831.                     parseall_incomment=1;
  832.                 }
  833.               } else
  834.                 parseall_incomment=0;
  835.  
  836.               /* vérifier que l'on est pas dans un <!-- --> pur */
  837.               if ( (!intag) && (incomment) && (!inscript))
  838.                 noparse=1;        /* commentaire */
  839.  
  840.               // recherche d'URLs
  841.               if ((!parseall_incomment) && (!noparse)) {
  842.                 if (!p) {                   // non dΘja trouvΘ
  843.                   if (adr != r.adr) {     // >1 caractΦre
  844.                     // scanner les chaines
  845.                     if ((*adr == '\"') || (*adr=='\'')) {         // "xx.gif" 'xx.gif'
  846.                       if (strchr("=(,",parseall_lastc)) {    // exemple: a="img.gif..
  847.                         char *a=adr;
  848.                         char stop=*adr;  // " ou '
  849.                         int count=0;
  850.                         
  851.                         // sauter caractères
  852.                         a++;
  853.                         // copier
  854.                         while((*a) && (*a!='\'') && (*a!='\"') && (count<HTS_URLMAXSIZE)) { count++; a++; }
  855.                         
  856.                         // ok chaine terminée par " ou '
  857.                         if ((*a == stop) && (count<HTS_URLMAXSIZE) && (count>0)) {
  858.                           char c;
  859.                           char* aend;
  860.                           //
  861.                           aend=a;     // sauver dΘbut
  862.                           a++;
  863.                           while(is_taborspace(*a)) a++;
  864.                           c=*a;
  865.                           if (strchr("),;>/+\r\n",c)) {     // exemple: ..img.gif";
  866.                             // le / est pour funct("img.gif" /* URL */);
  867.                             char tempo[HTS_URLMAXSIZE*2];
  868.                             char type[256];
  869.                             int url_ok=0;      // url valide?
  870.                             tempo[0]='\0'; type[0]='\0';
  871.                             //
  872.                             strncat(tempo,adr+1,count);
  873.                             //
  874.                             if ((!strchr(tempo,' ')) || inscript) {   // espace dedans: mΘfiance! (sauf dans code javascript)
  875.                               int invalid_url=0;
  876.                               
  877.                               // Couper au # ou ? éventuel
  878.                               {
  879.                                 char* a=strchr(tempo,'#');
  880.                                 if (a)
  881.                                   *a='\0';
  882.                                 a=strchr(tempo,'?');
  883.                                 if (a)
  884.                                   *a='\0';
  885.                               }
  886.  
  887.                               // vérifier qu'il n'y a pas de caractères spéciaux
  888.                               if (!strnotempty(tempo))
  889.                                 invalid_url=1;
  890.                               else if (strchr(tempo,'*')
  891.                                 || strchr(tempo,'<')
  892.                                 || strchr(tempo,'>'))
  893.                                 invalid_url=1;
  894.                               
  895.                               /* non invalide? */
  896.                               if (!invalid_url) {
  897.                                 // Un plus à la fin? Alors ne pas prendre sauf si extension ("/toto.html#"+tag)
  898.                                 if (c!='+') {    // PAS de plus α la fin
  899.                                   char* a;
  900.                                   // "Comparisons of scheme names MUST be case-insensitive" (RFC2616)                                  
  901.                                   //if ((strncmp(tempo,"http://",7)==0) || (strncmp(tempo,"ftp://",6)==0))  // ok pas de problΦme
  902.                                   if (
  903.                                        (strfield(tempo,"http:")) 
  904.                                     || (strfield(tempo,"ftp:"))
  905. #if HTS_USEOPENSSL
  906.                                     || (strfield(tempo,"https:"))
  907. #endif
  908.                                     )  // ok pas de problΦme
  909.                                     url_ok=1;
  910.                                   else if (tempo[strlen(tempo)-1]=='/') {        // un slash: ok..
  911.                                     if (inscript)   // sinon si pas javascript, mΘfiance (rΘpertoire style base?)
  912.                                       url_ok=1;
  913.                                   } else if ((a=strchr(tempo,'/'))) {        // un slash: ok..
  914.                                     if (inscript) {    // sinon si pas javascript, mΘfiance (style "text/css")
  915.                                       if (strchr(a+1,'/'))  // un seul / : abandon (STYLE type='text/css')
  916.                                         url_ok=1;
  917.                                     }
  918.                                   }
  919.                                 }
  920.                                 // Prendre si extension reconnue
  921.                                 if (!url_ok) {
  922.                                   get_httptype(type,tempo,0);
  923.                                   if (strnotempty(type))     // type reconnu!
  924.                                     url_ok=1;
  925.                                   else if (is_dyntype(get_ext(tempo)))  // reconnu php,cgi,asp..
  926.                                     url_ok=1;
  927.                                   // MAIS pas les foobar@aol.com !!
  928.                                   if (strchr(tempo,'@'))
  929.                                     url_ok=0;
  930.                                 }
  931.                                 //
  932.                                 // Ok, cela pourrait être une URL
  933.                                 if (url_ok) {
  934.                                   
  935.                                   // Check if not forbidden tag (id,name..)
  936.                                   if (intag_start_valid) {
  937.                                     if (intag_start)
  938.                                       if (intag_startattr)
  939.                                         if (intag)
  940.                                           if (!inscript)
  941.                                             if (!incomment) {
  942.                                               int i=0,nop=0;
  943.                                               while( (nop==0) && (strnotempty(hts_nodetect[i])) ) {
  944.                                                 nop=rech_tageq(intag_startattr,hts_nodetect[i]);
  945.                                                 i++;
  946.                                               }
  947.                                               // Forbidden tag
  948.                                               if (nop) {
  949.                                                 url_ok=0;
  950.                                                 if ((opt.debug>1) && (opt.log!=NULL)) {
  951.                                                   fspc(opt.log,"debug"); fprintf(opt.log,"dirty parsing: bad tag avoided: %s"LF,hts_nodetect[i-1]); test_flush;
  952.                                                 }
  953.                                               }
  954.                                             }
  955.                                   }
  956.                                   
  957.                                   
  958.                                   // Accepter URL, on la traitera comme une URL normale!!
  959.                                   if (url_ok)
  960.                                     p=1;
  961.  
  962.                                 }
  963.                               }
  964.                             }
  965.                           }
  966.                         }
  967.                       }
  968.                     }
  969.                   }
  970.                 }  // p == 0
  971.                 
  972.                 // plus dans un commentaire
  973.                 if (!incomment_justquit)
  974.                   parseall_lastc=*adr;             // caractΦre avant le prochain
  975.                 
  976.               } // not in comment
  977.               
  978.             }  // if realspace
  979.           }  // if parseall
  980.           
  981.           
  982.           // ------------------------------------------------------------
  983.           // p!=0 : on a repéré un éventuel lien
  984.           // ------------------------------------------------------------
  985.           //
  986.           if ((p>0) || (valid_p)) {    // on a repΘrΘ un lien
  987.             //int lien_valide=0;
  988.             char* eadr=NULL;          /* fin de l'URL */
  989.             char* quote_adr=NULL;     /* adresse du ? dans l'adresse */
  990.             int ok=1;
  991.             char quote='\0';
  992.             
  993.             // si nofollow ou un stop a été déclenché, réécrire tous les liens en externe
  994.             if ((nofollow) || (opt.state.stop))
  995.               p_nocatch=1;
  996.  
  997.             // écrire codebase avant, flusher avant code
  998.             if ((p_type==-1) || (p_type==-2)) {
  999.               if ((opt.getmode & 1) && (ptr>0)) {
  1000.                 HT_ADD_ADR;    // refresh
  1001.               }
  1002.               lastsaved=adr;    // dernier Θcrit+1
  1003.             }
  1004.             
  1005.             // sauter espaces
  1006.             adr+=p;
  1007.             while((is_space(*adr)) && (quote=='\0')) {
  1008.               if (!quote)
  1009.                 if ((*adr=='\"') || (*adr=='\''))
  1010.                   quote=*adr;                     // on doit attendre cela α la fin
  1011.                                                   // puis quitter
  1012.                 adr++;    // sauter les espaces, "" et cie
  1013.             }
  1014.  
  1015.             /* Stop at \n (LF) if primary links*/
  1016.             if (ptr == 0)
  1017.               quote='\n';
  1018.             /* s'arrêter que ce soit un ' ou un " : pour document.write('<img src="foo'+a); par exemple! */
  1019.             else if (inscript)
  1020.               quote='\0';
  1021.             
  1022.             // sauter éventuel \" ou \' javascript
  1023.             if (inscript) {    // on est dans un obj.write("..
  1024.               if (*adr=='\\') {
  1025.                 if ((*(adr+1)=='\'') || (*(adr+1)=='"')) {  // \" ou \'
  1026.                   adr+=2;    // sauter
  1027.                 }
  1028.               }
  1029.             }
  1030.             
  1031.             // sauter content="1;URL=http://..
  1032.             if (p_searchMETAURL) {
  1033.               int l=0;
  1034.               while(!strfield(adr+l,"URL=") && (l<128) ) l++;
  1035.               if (!strfield(adr+l,"URL="))
  1036.                 ok=-1;
  1037.               else
  1038.                 adr+=(l+4);
  1039.             }
  1040.  
  1041.             /* éviter les javascript:document.location=.. : les parser, plutôt */
  1042.             if (ok!=-1) {
  1043.               if (strfield(adr,"javascript:")) {
  1044.                 ok=-1;
  1045.                 /*
  1046.                 On est dΘsormais dans du code javascript
  1047.                 */
  1048.                 inscript_tag=inscript=1;
  1049.                 inscript_tag_lastc=quote;     /* α attendre α la fin */
  1050.               }
  1051.             }
  1052.             
  1053.             if (p_type==1) {
  1054.               if (*adr=='#') {
  1055.                 adr++;           // sauter # pour usemap etc
  1056.               }
  1057.             }
  1058.             eadr=adr;
  1059.             
  1060.             // ne pas flusher après code si on doit écrire le codebase avant!
  1061.             if ((p_type!=-1) && (p_type!=2) && (p_type!=-2)) {
  1062.               if ((opt.getmode & 1) && (ptr>0)) {
  1063.                 HT_ADD_ADR;    // refresh
  1064.               }
  1065.               lastsaved=adr;    // dernier Θcrit+1
  1066.               // après on écrira soit les données initiales,
  1067.               // soit une URL/lien modifié!
  1068.             } else if (p_type==-1) p_flush=adr;    // flusher jusqu'α adr ensuite
  1069.             
  1070.             if (ok!=-1) {    // continuer
  1071.               // découper le lien
  1072.               do {
  1073.                 if ((* (unsigned char*) eadr)<32) {   // caractΦre de contr⌠le (ou \0)
  1074.                   if (!is_space(*eadr))
  1075.                     ok=0; 
  1076.                 }
  1077.                 if ( ( ((int) (eadr - adr)) ) > HTS_URLMAXSIZE)  // ** trop long, >HTS_URLMAXSIZE caractΦres (on prΘvoit HTS_URLMAXSIZE autres pour path)
  1078.                   ok=-1;    // ne pas traiter ce lien
  1079.                 
  1080.                 if (ok) {
  1081.                   //if (*eadr!=' ') {  
  1082.                   if (is_space(*eadr)) {   // guillemets,CR, etc
  1083.                     if ((!quote) || (*eadr==quote))     // si pas d'attente de quote spΘciale ou si quote atteinte
  1084.                       ok=0; 
  1085.                   } else if (ending_p && (*eadr==ending_p))
  1086.                     ok=0;
  1087.                   else {
  1088.                     switch(*eadr) {
  1089.                     case '>': 
  1090.                       if (!quote) {
  1091.                         if (!inscript) {
  1092.                           intag=0;    // PLUS dans un tag!
  1093.                           intag_start_valid=0;
  1094.                         }
  1095.                         ok=0;
  1096.                       }
  1097.                       break;
  1098.                       /*case '<':*/ case '#': ok=0; break;    // case '?': non!
  1099.                     case '\\': if (inscript) ok=0; break;     // \" ou \' point d'arrΩt
  1100.                     case '?': quote_adr=adr; break;           // noter position query
  1101.                     }
  1102.                   }
  1103.                   //}
  1104.                 } 
  1105.                 eadr++;
  1106.               } while(ok==1);     
  1107.               
  1108.               // Empty link detected
  1109.               if ( (((int) (eadr - adr))) <= 1) {       // link empty
  1110.                 ok=-1;        // No
  1111.                 if (*adr != '#') {        // Not empty+unique #
  1112.                   if ( (((int) (eadr - adr)) == 1)) {       // 1=link empty with delim (end_adr-start_adr)
  1113.                     if (quote) {
  1114.                       if ((opt.getmode & 1) && (ptr>0)) { 
  1115.                         HT_ADD("#");        // We add this for a <href="">
  1116.                       }
  1117.                     }
  1118.                   }
  1119.                 }
  1120.               }
  1121.               
  1122.             }
  1123.             
  1124.             if (ok==0) {    // tester un lien
  1125.               char lien[HTS_URLMAXSIZE*2];
  1126.               int meme_adresse=0;      // 0 par dΘfaut pour primary
  1127.               //char *copie_de_adr=adr;
  1128.               //char* p;
  1129.               
  1130.               // construire lien (découpage)
  1131.               if ( (((int) (eadr -  adr))-1) < HTS_URLMAXSIZE  ) {    // pas trop long?
  1132.                 strncpy(lien,adr,((int) (eadr - adr))-1);
  1133.                 *(lien+  (((int) (eadr -  adr)))-1  )='\0';
  1134.                 //printf("link: %s\n",lien);          
  1135.                 // supprimer les espaces
  1136.                 while((lien[strlen(lien)-1]==' ') && (strnotempty(lien))) lien[strlen(lien)-1]='\0';
  1137.  
  1138.                 
  1139. #if HTS_STRIP_DOUBLE_SLASH
  1140.                 // supprimer les // en / (sauf pour http://)
  1141.                 {
  1142.                   char *a,*p,*q;
  1143.                   int done=0;
  1144.                   a=strchr(lien,':');    // http://
  1145.                   if (a) {
  1146.                     a++;
  1147.                     while(*a=='/') a++;    // position aprΦs http://
  1148.                   } else {
  1149.                     a=lien;                // dΘbut
  1150.                     while(*a=='/') a++;    // position aprΦs http://
  1151.                   }
  1152.                   q=strchr(a,'?');     // ne pas traiter aprΦs '?'
  1153.                   if (!q)
  1154.                     q=a+strlen(a)-1;
  1155.                   while(( p=strstr(a,"//")) && (!done) ) {    // remplacer // par /
  1156.                     if ((int) p>(int) q) {   // aprΦs le ? (toto.cgi?param=1//2.3)
  1157.                       done=1;    // stopper
  1158.                     } else {
  1159.                       char tempo[HTS_URLMAXSIZE*2];
  1160.                       tempo[0]='\0';
  1161.                       strncat(tempo,a,(int) p - (int) a);
  1162.                       strcat (tempo,p+1);
  1163.                       strcpy(a,tempo);    // recopier
  1164.                     }
  1165.                   }
  1166.                 }
  1167. #endif
  1168.  
  1169.               } else
  1170.                 lien[0]='\0';    // erreur
  1171.               
  1172.               // ------------------------------------------------------
  1173.               // Lien repéré et extrait
  1174.               if (strnotempty(lien)>0) {           // construction du lien
  1175.                 char adr[HTS_URLMAXSIZE*2],fil[HTS_URLMAXSIZE*2];          // ATTENTION adr cache le "vrai" adr
  1176.                 int forbidden_url=-1;              // lien non interdit (mais non autorisΘ..)
  1177.                 int just_test_it=0;                // mode de test des liens
  1178.                 int set_prio_to=0;                 // pour capture de page isolΘe
  1179.                 int import_done=0;                 // lien importΘ (ne pas scanner ensuite *α priori*)
  1180.                 //
  1181.                 adr[0]='\0'; fil[0]='\0';
  1182.                 //
  1183.                 // 0: autorisΘ
  1184.                 // 1: interdit (patcher tout de mΩme adresse)
  1185.                 
  1186.                 if ((opt.debug>1) && (opt.log!=NULL)) {
  1187.                   fspc(opt.log,"debug"); fprintf(opt.log,"link detected in html: %s"LF,lien); test_flush;
  1188.                 }
  1189.  
  1190.                 // external check
  1191. #if HTS_ANALYSTE
  1192.                 if (!hts_htmlcheck_linkdetected(lien)) {
  1193.                   error=1;    // erreur
  1194.                   if (opt.errlog) {
  1195.                     fspc(opt.errlog,"error"); fprintf(opt.errlog,"Link %s refused by external wrapper"LF,lien);
  1196.                     test_flush;
  1197.                   }
  1198.                 }
  1199. #endif
  1200.                 
  1201.                 // purger espaces de début et fin, CR,LF résiduels
  1202.                 // (IMG SRC="foo.<\n>gif")
  1203.                 {
  1204.                   char* a;
  1205.                   while (is_realspace(lien[0])) {
  1206.                     char tempo[HTS_URLMAXSIZE*2];
  1207.                     tempo[0]='\0';
  1208.                     strcpy(tempo,lien+1);
  1209.                     strcpy(lien,tempo);
  1210.                   }
  1211.                   while(strnotempty(lien)
  1212.                         && (is_realspace(lien[max(0,(int)(strlen(lien))-1)])) ) {
  1213.                     lien[strlen(lien)-1]='\0';
  1214.                   } 
  1215.                   while ((a=strchr(lien,'\n'))) {
  1216.                     char tempo[HTS_URLMAXSIZE*2];
  1217.                     tempo[0]='\0';
  1218.                     strncat(tempo,lien,(int) (a - lien));
  1219.                     strcat(tempo,a+1);
  1220.                     strcpy(lien,tempo);
  1221.                   }
  1222.                   while ((a=strchr(lien,'\r'))) {
  1223.                     char tempo[HTS_URLMAXSIZE*2];
  1224.                     tempo[0]='\0';
  1225.                     strncat(tempo,lien,(int) (a - lien));
  1226.                     strcat(tempo,a+1);
  1227.                     strcpy(lien,tempo);
  1228.                   }
  1229.                 }
  1230.                 
  1231.                 /* Unescape/escape %20 and other   */
  1232.                 {
  1233.                   char query[HTS_URLMAXSIZE*2];
  1234.                   char* a=strchr(lien,'?');
  1235.                   if (a) {
  1236.                     strcpy(query,a);
  1237.                     *a='\0';
  1238.                   } else
  1239.                     query[0]='\0';
  1240.                   // conversion &amp; -> & et autres joyeusetés
  1241.                   unescape_amp(lien);
  1242.                   unescape_amp(query);
  1243.                   // décoder l'inutile (%2E par exemple) et coder espaces
  1244.                   // XXXXXXXXXXXXXXXXX strcpy(lien,unescape_http(lien));
  1245.                   strcpy(lien,unescape_http_unharm(lien, (no_esc_utf)?0:1));
  1246.                   escape_spc_url(lien);
  1247.                   strcat(lien,query);     /* restore */
  1248.                 }
  1249.                 
  1250.                 // convertir les éventuels \ en des / pour éviter des problèmes de reconnaissance!
  1251.                 {
  1252.                   char* a=jump_identification(lien);
  1253.                   while( (a=strchr(a,'\\')) ) *a='/';
  1254.                 }
  1255.                 
  1256.                 // supprimer le(s) ./
  1257.                 while ((lien[0]=='.') && (lien[1]=='/')) {
  1258.                   char tempo[HTS_URLMAXSIZE*2];
  1259.                   strcpy(tempo,lien+2);
  1260.                   strcpy(lien,tempo);
  1261.                 }
  1262.                 if (strnotempty(lien)==0)  // sauf si plus de nom de fichier
  1263.                   strcpy(lien,"./");
  1264.                 
  1265.                 // vérifie les /~machin -> /~machin/
  1266.                 // supposition dangereuse?
  1267.                 // OUI!!
  1268. #if HTS_TILDE_SLASH
  1269.                 if (lien[strlen(lien)-1]!='/') {
  1270.                   char *a=lien+strlen(lien)-1;
  1271.                   // éviter aussi index~1.html
  1272.                   while (((int) a>(int) lien) && (*a!='~') && (*a!='/') && (*a!='.')) a--;
  1273.                   if (*a=='~') {
  1274.                     strcat(lien,"/");    // ajouter slash
  1275.                   }
  1276.                 }
  1277. #endif
  1278.                 
  1279.                 // APPLET CODE="mixer.MixerApplet.class" --> APPLET CODE="mixer/MixerApplet.class"
  1280.                 // yes, this is dirty
  1281.                 // but I'm so lazzy..
  1282.                 // and besides the java "code" convention is really a pain in html code
  1283.                 if (p_type==-1) {
  1284.                   char* a=strrchr(lien,'.');
  1285.                   add_class_dots_to_patch=0;
  1286.                   if (a) {
  1287.                     char* b;
  1288.                     do {
  1289.                       b=strchr(lien,'.');
  1290.                       if ((b != a) && (b)) {
  1291.                         add_class_dots_to_patch++;
  1292.                         *b='/';
  1293.                       }
  1294.                     } while((b != a) && (b));
  1295.                   }
  1296.                 }
  1297.                 
  1298.                 // Θliminer les Θventuels :80 (port par dΘfaut!)
  1299.                 if (link_has_authority(lien)) {
  1300.                   char * a;
  1301.                   a=strstr(lien,"//");    // "//" authority
  1302.                   if (a)
  1303.                     a+=2;
  1304.                   else
  1305.                     a=lien;
  1306.                   // while((*a) && (*a!='/') && (*a!=':')) a++;
  1307.                   a=jump_toport(a);
  1308.                   if (a) {  // port
  1309.                     int port=0;
  1310.                     int defport=80;
  1311.                     char* b=a+1;
  1312. #if HTS_USEOPENSSL
  1313.                     // FIXME
  1314.                     //if (strfield(adr, "https:")) {
  1315.                     //}
  1316. #endif
  1317.                     while(isdigit((unsigned char)*b)) { port*=10; port+=(int) (*b-'0'); b++; }
  1318.                     if (port==defport) {  // port 80, default - c'est dΘbile
  1319.                       char tempo[HTS_URLMAXSIZE*2];
  1320.                       tempo[0]='\0';
  1321.                       strncat(tempo,lien,(int) (a - lien));
  1322.                       strcat(tempo,a+3);  // sauter :80
  1323.                       strcpy(lien,tempo);
  1324.                     }
  1325.                   }
  1326.                 }
  1327.                 
  1328.                 // filtrer les parazites (mailto & cie)
  1329.                 /*
  1330.                 if (strfield(lien,"mailto:")) {  // ne pas traiter
  1331.                   error=1;
  1332.                 } else if (strfield(lien,"news:")) {  // ne pas traiter
  1333.                   error=1;
  1334.                 }
  1335.                 */
  1336.                 
  1337.                 // vΘrifier que l'on ne doit pas ajouter de .class
  1338.                 if (!error) {
  1339.                   if (add_class) {
  1340.                     char *a = lien+strlen(lien)-1;
  1341.                     while(( a > lien) && (*a!='/') && (*a!='.')) a--;
  1342.                     if (*a != '.')
  1343.                       strcat(lien,".class");    // ajouter .class
  1344.                     else if (!strfield2(a,".class"))
  1345.                       strcat(lien,".class");    // idem
  1346.                   }
  1347.                 }
  1348.                 
  1349.                 // si c'est un chemin, alors vΘrifier (toto/toto.html -> http://www/toto/)
  1350.                 if (!error) {
  1351.                   if ((opt.debug>1) && (opt.log!=NULL)) {
  1352.                     fspc(opt.log,"debug"); fprintf(opt.log,"position link check %s"LF,lien); test_flush;
  1353.                   }
  1354.                   
  1355.                   if ((p_type==2) || (p_type==-2)) {   // code ou codebase                        
  1356.                     // codebase=applet must be handled as a directory (applet/)
  1357.                     if (p_type==-2) {    // codebase
  1358.                       if (strnotempty(lien)) {
  1359.                         if (lien[strlen(lien)-1]!='/') {  // test the link itself (fil is not computed yet here)
  1360.                           strcat(lien,"/");
  1361.                         }
  1362.                       }
  1363.                     }
  1364.                     /* only one ending / (bug on some pages) */
  1365.                     if ((int)strlen(lien)>2) {
  1366.                       while( (lien[strlen(lien)-2]=='/') && ((int)strlen(lien)>2) )    /* double // (bug) */
  1367.                         lien[strlen(lien)-1]='\0';
  1368.                     }
  1369.                     // copier nom host si besoin est
  1370.                     if (!link_has_authority(lien)) {  // pas de http://
  1371.                       char adr2[HTS_URLMAXSIZE*2],fil2[HTS_URLMAXSIZE*2];  // ** euh ident_url_relatif??
  1372.                       if (ident_url_relatif(lien,urladr,urlfil,adr2,fil2)<0) {                        
  1373.                         error=1;
  1374.                       } else {
  1375.                         strcpy(lien,"http://");
  1376.                         strcat(lien,adr2);
  1377.                         if (*fil2!='/')
  1378.                           strcat(lien,"/");
  1379.                         strcat(lien,fil2);
  1380.                         {
  1381.                           char* a;
  1382.                           a=lien+strlen(lien)-1;
  1383.                           while((*a) && (*a!='/') && ( a> lien)) a--;
  1384.                           if (*a=='/') {
  1385.                             *(a+1)='\0';
  1386.                           }
  1387.                         }
  1388.                         //char tempo[HTS_URLMAXSIZE*2];
  1389.                         //strcpy(tempo,"http://");
  1390.                         //strcat(tempo,urladr);    // host
  1391.                         //if (*lien!='/')
  1392.                         //  strcat(tempo,"/");
  1393.                         //strcat(tempo,lien);
  1394.                         //strcpy(lien,tempo);
  1395.                       }
  1396.                     }
  1397.                     
  1398.                     if (!error) {  // pas d'erreur?
  1399.                       if (p_type==2) {   // code ET PAS codebase      
  1400.                         char* a=lien+strlen(lien)-1;
  1401.                         while( (a > lien) && (*a) && (*a!='/')) a--;
  1402.                         if (*a=='/')     // ok on a repΘrΘ le dernier /
  1403.                           *(a+1)='\0';   // couper
  1404.                         else {
  1405.                           *lien='\0';    // Θliminer
  1406.                           error=1;   // erreur, ne pas poursuivre
  1407.                         }      
  1408.                       }
  1409.                       
  1410.                       // stocker base ou codebase?
  1411.                       switch(p_type) {
  1412.                       case 2: { 
  1413.                         //if (*lien!='/') strcat(base,"/");
  1414.                         strcpy(base,lien);
  1415.                               }
  1416.                         break;      // base
  1417.                       case -2: {
  1418.                         //if (*lien!='/') strcat(codebase,"/");
  1419.                         strcpy(codebase,lien); 
  1420.                                }
  1421.                         break;  // base
  1422.                       }
  1423.                       
  1424.                       if ((opt.debug>1) && (opt.log!=NULL)) {
  1425.                         fspc(opt.log,"debug"); fprintf(opt.log,"code/codebase link %s base %s"LF,lien,base); test_flush;
  1426.                       }
  1427.                       //printf("base code: %s - %s\n",lien,base);
  1428.                     }
  1429.                     
  1430.                   } else {
  1431.                     char* _base;
  1432.                     if (p_type==-1)   // code (applet)
  1433.                       _base=codebase;
  1434.                     else
  1435.                       _base=base;
  1436.  
  1437.                     
  1438.                     // ajouter chemin de base href..
  1439.                     if (strnotempty(_base)) {       // considΘrer base
  1440.                       if (!link_has_authority(lien)) {    // non absolue
  1441.                         if (*lien!='/') {           // non absolu sur le site (/)
  1442.                           if ( ((int) strlen(_base)+(int) strlen(lien))<HTS_URLMAXSIZE) {
  1443.                             // mailto: and co: do NOT add base
  1444.                             if (ident_url_relatif(lien,urladr,urlfil,adr,fil)>=0) {
  1445.                               char tempo[HTS_URLMAXSIZE*2];
  1446.                               // base est absolue
  1447.                               strcpy(tempo,_base);
  1448.                               strcat(tempo,lien);
  1449.                               strcpy(lien,tempo);        // patcher en considΘrant base
  1450.                               // ** vΘrifier que ../ fonctionne (ne doit pas arriver mais bon..)
  1451.                               
  1452.                               if ((opt.debug>1) && (opt.log!=NULL)) {
  1453.                                 fspc(opt.log,"debug"); fprintf(opt.log,"link modified with code/codebase %s"LF,lien); test_flush;
  1454.                               }
  1455.                             }
  1456.                           } else {
  1457.                             error=1;    // erreur
  1458.                             if (opt.errlog) {
  1459.                               fspc(opt.errlog,"error"); fprintf(opt.errlog,"Link %s too long with base href"LF,lien);
  1460.                               test_flush;
  1461.                             }
  1462.                           }
  1463.                         }
  1464.                       }
  1465.                     }
  1466.                     
  1467.  
  1468.                   }
  1469.                   }
  1470.                   
  1471.                   
  1472.                   // transformer lien quelconque (http, relatif, etc) en une adresse
  1473.                   // et un chemin+fichier (adr,fil)
  1474.                   if (!error) {
  1475.                     int reponse;
  1476.                     if ((opt.debug>1) && (opt.log!=NULL)) {
  1477.                       fspc(opt.log,"debug"); fprintf(opt.log,"build relative link %s with %s%s"LF,lien,urladr,urlfil); test_flush;
  1478.                     }
  1479.                     if ((reponse=ident_url_relatif(lien,urladr,urlfil,adr,fil))<0) {                        
  1480.                       adr[0]='\0';    // erreur
  1481.                       if (reponse==-2) {
  1482.                         if (opt.errlog) {
  1483.                           fspc(opt.errlog,"warning"); fprintf(opt.errlog,"Link %s not caught (unknown ftp:// protocol)"LF,lien);
  1484.                           test_flush;
  1485.                         }
  1486.                       } else {
  1487.                         if ((opt.debug>1) && (opt.errlog!=NULL)) {
  1488.                           fspc(opt.errlog,"debug"); fprintf(opt.errlog,"ident_url_relatif failed for %s with %s%s"LF,lien,urladr,urlfil); test_flush;
  1489.                         }
  1490.                       }
  1491.                     }
  1492.                   } else {
  1493.                     if ((opt.debug>1) && (opt.log!=NULL)) {
  1494.                       fspc(opt.log,"debug"); fprintf(opt.log,"link %s not build, error detected before"LF,lien); test_flush;
  1495.                     }
  1496.                     adr[0]='\0';
  1497.                   }
  1498.                   
  1499. #if HTS_CHECK_STRANGEDIR
  1500.                   // !WARNING!
  1501.                   // Here we test odd links such as www.truc.fr/machin (no trailing slash)
  1502.                   // there is no known way to tell a directory from a file
  1503.                   // over http A PRIORI: so the link is actually tested
  1504.                   // If the link has moved, adr and fil are simply recomputed
  1505.                   // DRAWBACK: the test may be run several times! to be reviewed!!!
  1506.                   if ((adr[0]!='\0') && (strcmp(adr,"file://")!=0) && (p_type!=2) && (p_type!=-2)) {
  1507.                     //## if ((adr[0]!='\0') && (adr[0]!=lOCAL_CHAR) && (p_type!=2) && (p_type!=-2)) {
  1508.                     if (fil[strlen(fil)-1]!='/') {  // not a directory
  1509.                       if (ishtml(fil)==-2) {    // no extension
  1510.                         char loc[HTS_URLMAXSIZE*2];  // possible new location
  1511.                         loc[0]='\0';
  1512.                         if ((opt.debug>1) && (opt.log!=NULL)) {
  1513.                           fspc(opt.log,"debug"); fprintf(opt.log,"link-check-directory: %s%s"LF,adr,fil);
  1514.                           test_flush;
  1515.                         }
  1516.                         
  1517.                         // probe for a possible new location
  1518.                         switch (http_location(adr,fil,loc).statuscode) {
  1519.                         case 200: // ok in the end
  1520.                           if (strnotempty(loc)) {  // the address has changed
  1521.                             if (opt.errlog) {
  1522.                               fspc(opt.errlog,"warning"); fprintf(opt.errlog,"Link %s%s has moved to %s for %s%s"LF,adr,fil,loc,urladr,urlfil);
  1523.                               test_flush;
  1524.                             }
  1525.                             
  1526.                             // recompute adr and fil!
  1527.                             if (ident_url_absolute(loc,adr,fil)==-1) {
  1528.                               adr[0]='\0';  // cancel
  1529.                               if ((opt.debug>1) && (opt.log!=NULL)) {
  1530.                                 fspc(opt.log,"debug"); fprintf(opt.log,"link-check-dir: %s%s"LF,adr,fil);
  1531.                                 test_flush;
  1532.                               }
  1533.                             }
  1534.                             
  1535.                           }
  1536.                           break;
  1537.                         case -2: case -3:  // timeout or fatal error
  1538.                           if (opt.errlog) {
  1539.                             fspc(opt.errlog,"warning"); fprintf(opt.errlog,"Connection too slow for testing link %s%s (from %s%s)"LF,adr,fil,urladr,urlfil);
  1540.                             test_flush;
  1541.                           }
  1542.                           
  1543.                           break;
  1544.                         }
  1545.                         
  1546.                       }
  1547.                     } 
  1548.                   }
  1549. #endif
  1550.                   
  1551.                   // Le lien doit juste Ωtre rΘΘcrit, mais ne doit pas gΘnΘrer un lien
  1552.                   // exemple: <FORM ACTION="url_cgi">
  1553.                   if (p_nocatch) {
  1554.                     forbidden_url=1;    // interdire rΘcupΘration du lien
  1555.                     if ((opt.debug>1) && (opt.log!=NULL)) {
  1556.                       fspc(opt.log,"debug"); fprintf(opt.log,"link forced external at %s%s"LF,adr,fil);
  1557.                       test_flush;
  1558.                     }
  1559.                   }
  1560.                   
  1561.                   // Tester si un lien doit Ωtre acceptΘ ou refusΘ (wizard)
  1562.                   // forbidden_url=1 : lien refusΘ
  1563.                   // forbidden_url=0 : lien acceptΘ
  1564.                   //if ((ptr>0) && (p_type!=2) && (p_type!=-2)) {    // tester autorisations?
  1565.                   if ((p_type!=2) && (p_type!=-2)) {    // tester autorisations?
  1566.                     if (!p_nocatch) {
  1567.                       if (adr[0]!='\0') {          
  1568.                         if ((opt.debug>1) && (opt.log!=NULL)) {
  1569.                           fspc(opt.log,"debug"); fprintf(opt.log,"wizard link test at %s%s.."LF,adr,fil);
  1570.                           test_flush;
  1571.                         }
  1572.                         forbidden_url=hts_acceptlink(&opt,ptr,lien_tot,liens,
  1573.                           adr,fil,
  1574.                           filters,&filptr,filter_max,
  1575.                           &robots,
  1576.                           &set_prio_to,
  1577.                           &just_test_it);
  1578.                         if ((opt.debug>1) && (opt.log!=NULL)) {
  1579.                           fspc(opt.log,"debug"); fprintf(opt.log,"result for wizard link test: %d"LF,forbidden_url);
  1580.                           test_flush;
  1581.                         }
  1582.                       }
  1583.                     }
  1584.                   }
  1585.                   
  1586.                   // calculer meme_adresse
  1587.                   meme_adresse=strfield2(jump_identification(adr),jump_identification(urladr));
  1588.                   
  1589.                   
  1590.                   
  1591.                   // DΘbut partie sauvegarde
  1592.                   
  1593.                   // ici on forme le nom du fichier α sauver, et on patche l'URL
  1594.                   if (adr[0]!='\0') {
  1595.                     // savename: simplifier les ../ et autres joyeusetΘs
  1596.                     char save[HTS_URLMAXSIZE*2];
  1597.                     int r_sv=0;
  1598.                     // En cas de moved, adresse premiΦre
  1599.                     char former_adr[HTS_URLMAXSIZE*2];
  1600.                     char former_fil[HTS_URLMAXSIZE*2];
  1601.                     //
  1602.                     save[0]='\0'; former_adr[0]='\0'; former_fil[0]='\0';
  1603.                     //
  1604.                     
  1605.                     // nom du chemin α sauver si on doit le calculer
  1606.                     // note: url_savename peut dΘcider de tester le lien si il le trouve
  1607.                     // suspect, et modifier alors adr et fil
  1608.                     // dans ce cas on aura une rΘfΘrence directe au lieu des traditionnels
  1609.                     // moved en cascade (impossible α reproduire α priori en local, lorsque des fichiers
  1610.                     // gif sont impliquΘs par exemple)
  1611.                     if ((p_type!=2) && (p_type!=-2)) {  // pas base href ou codebase
  1612.                       if (forbidden_url!=1) {
  1613.                         char last_adr[HTS_URLMAXSIZE*2];
  1614.                         last_adr[0]='\0';
  1615.                         //char last_fil[HTS_URLMAXSIZE*2]="";
  1616.                         strcpy(last_adr,adr);    // ancienne adresse
  1617.                         //strcpy(last_fil,fil);    // ancien chemin
  1618.                         r_sv=url_savename(adr,fil,save,former_adr,former_fil,liens[ptr]->adr,liens[ptr]->fil,&opt,liens,lien_tot,back,back_max,&cache,&hash,ptr,numero_passe);
  1619.                         if (strcmp(jump_identification(last_adr),jump_identification(adr)) != 0) {  // a changΘ
  1620.                           
  1621.                           // 2e test si moved
  1622.                           
  1623.                           // Tester si un lien doit Ωtre acceptΘ ou refusΘ (wizard)
  1624.                           // forbidden_url=1 : lien refusΘ
  1625.                           // forbidden_url=0 : lien acceptΘ
  1626.                           if ((ptr>0) && (p_type!=2) && (p_type!=-2)) {    // tester autorisations?
  1627.                             if (!p_nocatch) {
  1628.                               if (adr[0]!='\0') {          
  1629.                                 if ((opt.debug>1) && (opt.log!=NULL)) {
  1630.                                   fspc(opt.log,"debug"); fprintf(opt.log,"wizard moved link retest at %s%s.."LF,adr,fil);
  1631.                                   test_flush;
  1632.                                 }
  1633.                                 forbidden_url=hts_acceptlink(&opt,ptr,lien_tot,liens,
  1634.                                   adr,fil,
  1635.                                   filters,&filptr,filter_max,
  1636.                                   &robots,
  1637.                                   &set_prio_to,
  1638.                                   &just_test_it);
  1639.                                 if ((opt.debug>1) && (opt.log!=NULL)) {
  1640.                                   fspc(opt.log,"debug"); fprintf(opt.log,"result for wizard moved link retest: %d"LF,forbidden_url);
  1641.                                   test_flush;
  1642.                                 }
  1643.                               }
  1644.                             }
  1645.                           }
  1646.                           
  1647.                           //import_done=1;    // c'est un import!
  1648.                           meme_adresse=0;   // on a changΘ
  1649.                         }
  1650.                       } else {
  1651.                         strcpy(save,"");  // dummy
  1652.                       }
  1653.                     }
  1654.                     if (r_sv!=-1) {  // pas d'erreur, on continue
  1655.                       /* log */
  1656.                       if ((opt.debug>1) && (opt.log!=NULL)) {
  1657.                         fspc(opt.log,"debug");
  1658.                         if (forbidden_url!=1) {    // le lien va Ωtre chargΘ
  1659.                           if ((p_type==2) || (p_type==-2)) {  // base href ou codebase, pas un lien
  1660.                             fprintf(opt.log,"Code/Codebase: %s%s"LF,adr,fil);
  1661.                           } else if ((opt.getmode & 4)==0) {
  1662.                             fprintf(opt.log,"Record: %s%s -> %s"LF,adr,fil,save);
  1663.                           } else {
  1664.                             if (!ishtml(fil))
  1665.                               fprintf(opt.log,"Record after: %s%s -> %s"LF,adr,fil,save);
  1666.                             else
  1667.                               fprintf(opt.log,"Record: %s%s -> %s"LF,adr,fil,save);
  1668.                           } 
  1669.                         } else
  1670.                           fprintf(opt.log,"External: %s%s"LF,adr,fil);
  1671.                         test_flush;
  1672.                       }
  1673.                       /* FIN log */
  1674.                       
  1675.                       // Θcrire lien
  1676.                       if ((p_type==2) || (p_type==-2)) {  // base href ou codebase, sauter
  1677.                         lastsaved=eadr-1+1;  // sauter "
  1678.                       }
  1679.                       /* */
  1680.                       else if (opt.urlmode==0) {    // URL absolue dans tous les cas
  1681.                         if ((opt.getmode & 1) && (ptr>0)) {    // ecrire les html
  1682.                           if (!link_has_authority(adr)) {
  1683.                             HT_ADD("http://");
  1684.                           }
  1685.                           if (!opt.passprivacy) {
  1686.                             HT_ADD(adr);     // Password
  1687.                           } else {
  1688.                             HT_ADD(jump_identification(adr));     // No Password
  1689.                           }
  1690.                           if (*fil!='/')
  1691.                             HT_ADD("/");
  1692.                           HT_ADD(fil);
  1693.                         }
  1694.                         lastsaved=eadr-1;    // dernier Θcrit+1 (enfin euh apres on fait un ++ alors hein)
  1695.                       /* */
  1696.                       } else if (opt.urlmode >= 4) {    // ne rien faire dans tous les cas!
  1697.                       /* */
  1698.                       /* leave the link 'as is' */
  1699.                       /* Sinon, dΘpend de interne/externe */
  1700.                       } else if (forbidden_url==1) {    // le lien ne sera pas chargΘ, rΘfΘrence externe!
  1701.                         if ((opt.getmode & 1) && (ptr>0)) {
  1702.                           if (p_type!=-1) {     // pas que le nom de fichier (pas classe java)
  1703.                             if (!opt.external) {
  1704.                               if (!link_has_authority(adr)) {
  1705.                                 HT_ADD("http://");
  1706.                                 if (!opt.passprivacy) {
  1707.                                   HT_ADD(adr);     // Password
  1708.                                 } else {
  1709.                                   HT_ADD(jump_identification(adr));     // No Password
  1710.                                 }
  1711.                                 if (*fil!='/')
  1712.                                   HT_ADD("/");
  1713.                                 HT_ADD(fil);
  1714.                               } else {
  1715.                                 char* aut = strstr(adr, "//");
  1716.                                 if (aut) {
  1717.                                   char tmp[256];
  1718.                                   tmp[0]='\0';
  1719.                                   strncat(tmp, adr, (int) (aut - adr));   // scheme
  1720.                                   if (!opt.passprivacy) {
  1721.                                     HT_ADD(jump_protocol(adr));          // Password
  1722.                                   } else {
  1723.                                     HT_ADD(jump_identification(adr));     // No Password
  1724.                                   }
  1725.                                   if (*fil!='/')
  1726.                                     HT_ADD("/");
  1727.                                   HT_ADD(fil);
  1728.                                 }
  1729.                               }
  1730.                               //
  1731.                             } else {    // fichier/page externe, mais on veut gΘnΘrer une erreur
  1732.                               //
  1733.                               int patch_it=0;
  1734.                               int add_url=0;
  1735.                               char* cat_name=NULL;
  1736.                               char* cat_data=NULL;
  1737.                               int cat_nb=0;
  1738.                               int cat_data_len=0;
  1739.                               
  1740.                               // ajouter lien external
  1741.                               switch ( (link_has_authority(adr)) ? 1 : ( (fil[strlen(fil)-1]=='/')?1:(ishtml(fil))  ) ) {
  1742.                               case 1: case -2:       // html ou rΘpertoire
  1743.                                 if (opt.getmode & 1) {  // sauver html
  1744.                                   patch_it=1;   // redirect
  1745.                                   add_url=1;    // avec link?
  1746.                                   cat_name="external.html";
  1747.                                   cat_nb=0;
  1748.                                   cat_data=HTS_DATA_UNKNOWN_HTML;
  1749.                                   cat_data_len=HTS_DATA_UNKNOWN_HTML_LEN;
  1750.                                 }
  1751.                                 break;
  1752.                               default:    // inconnu
  1753.                                 // asp, cgi..
  1754.                                 if (is_dyntype(get_ext(fil))) {
  1755.                                   patch_it=1;   // redirect
  1756.                                   add_url=1;    // avec link?
  1757.                                   cat_name="external.html";
  1758.                                   cat_nb=0;
  1759.                                   cat_data=HTS_DATA_UNKNOWN_HTML;
  1760.                                   cat_data_len=HTS_DATA_UNKNOWN_HTML_LEN;
  1761.                                 } else if ( (strfield2(fil+max(0,(int)strlen(fil)-4),".gif")) 
  1762.                                   || (strfield2(fil+max(0,(int)strlen(fil)-4),".jpg")) 
  1763.                                   || (strfield2(fil+max(0,(int)strlen(fil)-4),".xbm")) 
  1764.                                   || (ishtml(fil)!=0) ) {
  1765.                                   patch_it=1;   // redirect
  1766.                                   add_url=1;    // avec link aussi
  1767.                                   cat_name="external.gif";
  1768.                                   cat_nb=1;
  1769.                                   cat_data=HTS_DATA_UNKNOWN_GIF;
  1770.                                   cat_data_len=HTS_DATA_UNKNOWN_GIF_LEN;
  1771.                                 }
  1772.                                 break;
  1773.                               }// html,gif
  1774.                               
  1775.                               if (patch_it) {
  1776.                                 char save[HTS_URLMAXSIZE*2];
  1777.                                 char tempo[HTS_URLMAXSIZE*2];
  1778.                                 strcpy(save,opt.path_html);
  1779.                                 strcat(save,cat_name);
  1780.                                 if (lienrelatif(tempo,save,savename)==0) {
  1781.                                   if (!no_esc_utf)
  1782.                                     escape_uri(tempo);     // escape with %xx
  1783.                                   else
  1784.                                     escape_uri_utf(tempo);     // escape with %xx
  1785.                                   HT_ADD(tempo);    // page externe
  1786.                                   if (add_url) {
  1787.                                     HT_ADD("?link=");    // page externe
  1788.                                     
  1789.                                     // same as above
  1790.                                     if (!link_has_authority(adr)) {
  1791.                                       HT_ADD("http://");
  1792.                                       if (!opt.passprivacy) {
  1793.                                         HT_ADD(adr);     // Password
  1794.                                       } else {
  1795.                                         HT_ADD(jump_identification(adr));     // No Password
  1796.                                       }
  1797.                                       if (*fil!='/')
  1798.                                         HT_ADD("/");
  1799.                                       HT_ADD(fil);
  1800.                                     } else {
  1801.                                       char* aut = strstr(adr, "//");
  1802.                                       if (aut) {
  1803.                                         char tmp[256];
  1804.                                         tmp[0]='\0';
  1805.                                         strncat(tmp, adr, (int) (aut - adr) + 2);   // scheme
  1806.                                         HT_ADD(tmp);
  1807.                                         if (!opt.passprivacy) {
  1808.                                           HT_ADD(jump_protocol(adr));          // Password
  1809.                                         } else {
  1810.                                           HT_ADD(jump_identification(adr));     // No Password
  1811.                                         }
  1812.                                         if (*fil!='/')
  1813.                                           HT_ADD("/");
  1814.                                         HT_ADD(fil);
  1815.                                       }
  1816.                                     }
  1817.                                     //
  1818.  
  1819.                                   }
  1820.                                 }
  1821.                                 
  1822.                                 // écrire fichier?
  1823.                                 if (verif_external(cat_nb,1)) {
  1824.                                 //if (!fexist(fconcat(opt.path_html,cat_name))) {
  1825.                                   FILE* fp = filecreate(fconcat(opt.path_html,cat_name));
  1826.                                   if (fp) {
  1827.                                     if (cat_data_len==0) {   // texte
  1828.                                       verif_backblue(opt.path_html);
  1829.                                       fprintf(fp,"%s%s","<!-- Created by HTTrack Website Copier/"HTTRACK_VERSION" "HTTRACK_AFF_AUTHORS" -->"LF,cat_data);
  1830.                                     } else {                    // data
  1831.                                       fwrite(cat_data,cat_data_len,1,fp);
  1832.                                     }
  1833.                                     fclose(fp);
  1834.                                     usercommand(0,NULL,fconcat(opt.path_html,cat_name));
  1835.                                   }
  1836.                                 }
  1837.                               }  else {    // Θcrire normalement le nom de fichier
  1838.                                 HT_ADD("http://");
  1839.                                 if (!opt.passprivacy) {
  1840.                                   HT_ADD(adr);       // Password
  1841.                                 } else {
  1842.                                   HT_ADD(jump_identification(adr));       // No Password
  1843.                                 }
  1844.                                 if (*fil!='/')
  1845.                                   HT_ADD("/");
  1846.                                 HT_ADD(fil);
  1847.                               }// patcher?
  1848.                             }  // external
  1849.                           } else {  // que le nom de fichier (classe java)
  1850.                             // en gros recopie de plus bas: copier codebase et base
  1851.                             if (p_flush) {
  1852.                               char tempo[HTS_URLMAXSIZE*2];    // <-- ajoutΘ
  1853.                               char tempo_pat[HTS_URLMAXSIZE*2];
  1854.  
  1855.                               // Calculer chemin
  1856.                               tempo_pat[0]='\0';
  1857.                               strcpy(tempo,fil);  // <-- ajoutΘ
  1858.                               {
  1859.                                 char* a=strrchr(tempo,'/');
  1860.  
  1861.                                 // Example: we converted code="x.y.z.foo.class" into "x/y/z/foo.class"
  1862.                                 // we have to do the contrary now
  1863.                                 if (add_class_dots_to_patch>0) {
  1864.                                   while( (add_class_dots_to_patch>0) && (a) ) {
  1865.                                     *a='.';     // convert "false" java / into .
  1866.                                     add_class_dots_to_patch--;
  1867.                                     a=strrchr(tempo,'/');
  1868.                                   }
  1869.                                   // if add_class_dots_to_patch, this is because there is a problem!!
  1870.                                   if (add_class_dots_to_patch) {
  1871.                                     if (opt.errlog) {
  1872.                                       fspc(opt.errlog,"warning"); fprintf(opt.errlog,"Error: can not rewind java path %s, check html code"LF,tempo);
  1873.                                       test_flush;
  1874.                                     }
  1875.                                   }
  1876.                                 }
  1877.  
  1878.                                 // Cut path/filename
  1879.                                 if (a) {
  1880.                                   char tempo2[HTS_URLMAXSIZE*2];
  1881.                                   strcpy(tempo2,a+1);         // FICHIER
  1882.                                   strncat(tempo_pat,tempo,(int) (a - tempo)+1);  // chemin
  1883.                                   strcpy(tempo,tempo2);                     // fichier
  1884.                                 }
  1885.                               }
  1886.                               
  1887.                               // écrire codebase="chemin"
  1888.                               if ((opt.getmode & 1) && (ptr>0)) {
  1889.                                 char tempo4[HTS_URLMAXSIZE*2];
  1890.                                 tempo4[0]='\0';
  1891.                                 
  1892.                                 if (strnotempty(tempo_pat)) {
  1893.                                   HT_ADD("codebase=\"http://");
  1894.                                   if (!opt.passprivacy) {
  1895.                                     HT_ADD(adr);  // Password
  1896.                                   } else {
  1897.                                     HT_ADD(jump_identification(adr));  // No Password
  1898.                                   }
  1899.                                   if (*tempo_pat!='/') HT_ADD("/");
  1900.                                   HT_ADD(tempo_pat);
  1901.                                   HT_ADD("\" ");
  1902.                                 }
  1903.                                 
  1904.                                 strncat(tempo4,lastsaved,(int) (p_flush - lastsaved));
  1905.                                 HT_ADD(tempo4);    // refresh code="
  1906.                                 HT_ADD(tempo);
  1907.                               }
  1908.                             }
  1909.                           }
  1910.                         }
  1911.                         lastsaved=eadr-1;
  1912.                       }
  1913.                       /*
  1914.                       else if (opt.urlmode==1) {    // ABSOLU, c'est le cas le moins courant
  1915.                       //  NE FONCTIONNE PAS!!  (et est inutile)
  1916.                       if ((opt.getmode & 1) && (ptr>0)) {    // ecrire les html
  1917.                       // Θcrire le lien modifiΘ, absolu
  1918.                       HT_ADD("file:");
  1919.                       if (*save=='/')
  1920.                       HT_ADD(save+1)
  1921.                       else
  1922.                       HT_ADD(save)
  1923.                       }
  1924.                       lastsaved=eadr-1;    // dernier Θcrit+1 (enfin euh apres on fait un ++ alors hein)
  1925.                       }
  1926.                       */
  1927.                       else if (opt.urlmode==3) {    // URI absolue /
  1928.                         if ((opt.getmode & 1) && (ptr>0)) {    // ecrire les html
  1929.                           HT_ADD(fil);
  1930.                         }
  1931.                         lastsaved=eadr-1;    // dernier Θcrit+1 (enfin euh apres on fait un ++ alors hein)
  1932.                       }
  1933.                       else if (opt.urlmode==2) {  // RELATIF
  1934.                         char tempo[HTS_URLMAXSIZE*2];
  1935.                         tempo[0]='\0';
  1936.                         // calculer le lien relatif
  1937.                         
  1938.                         if (lienrelatif(tempo,save,savename)==0) {
  1939.                           if (!no_esc_utf)
  1940.                             escape_uri(tempo);     // escape with %xx
  1941.                           else
  1942.                             escape_uri_utf(tempo);     // escape with %xx
  1943.                           if ((opt.debug>1) && (opt.log!=NULL)) {
  1944.                             fspc(opt.log,"debug"); fprintf(opt.log,"relative link at %s build with %s and %s: %s"LF,adr,save,savename,tempo);
  1945.                             test_flush;
  1946.                           }
  1947.                           
  1948.                           // lien applet (code) - il faut placer un codebase avant
  1949.                           if (p_type==-1) {  // que le nom de fichier
  1950.                             
  1951.                             if (p_flush) {
  1952.                               char tempo_pat[HTS_URLMAXSIZE*2];
  1953.                               tempo_pat[0]='\0';
  1954.                               {
  1955.                                 char* a=strrchr(tempo,'/');
  1956.  
  1957.                                 // Example: we converted code="x.y.z.foo.class" into "x/y/z/foo.class"
  1958.                                 // we have to do the contrary now
  1959.                                 if (add_class_dots_to_patch>0) {
  1960.                                   while( (add_class_dots_to_patch>0) && (a) ) {
  1961.                                     *a='.';     // convert "false" java / into .
  1962.                                     add_class_dots_to_patch--;
  1963.                                     a=strrchr(tempo,'/');
  1964.                                   }
  1965.                                   // if add_class_dots_to_patch, this is because there is a problem!!
  1966.                                   if (add_class_dots_to_patch) {
  1967.                                     if (opt.errlog) {
  1968.                                       fspc(opt.errlog,"warning"); fprintf(opt.errlog,"Error: can not rewind java path %s, check html code"LF,tempo);
  1969.                                       test_flush;
  1970.                                     }
  1971.                                   }
  1972.                                 }
  1973.  
  1974.                                 if (a) {
  1975.                                   char tempo2[HTS_URLMAXSIZE*2];
  1976.                                   strcpy(tempo2,a+1);
  1977.                                   strncat(tempo_pat,tempo,(int) (a - tempo)+1);  // chemin
  1978.                                   strcpy(tempo,tempo2);                     // fichier
  1979.                                 }
  1980.                               }
  1981.                               
  1982.                               // écrire codebase="chemin"
  1983.                               if ((opt.getmode & 1) && (ptr>0)) {
  1984.                                 char tempo4[HTS_URLMAXSIZE*2];
  1985.                                 tempo4[0]='\0';
  1986.                                 
  1987.                                 if (strnotempty(tempo_pat)) {
  1988.                                   HT_ADD("codebase=\"");
  1989.                                   HT_ADD(tempo_pat);
  1990.                                   HT_ADD("\" ");
  1991.                                 }
  1992.                                 
  1993.                                 strncat(tempo4,lastsaved,(int) (p_flush - lastsaved));
  1994.                                 HT_ADD(tempo4);    // refresh code="
  1995.                               }
  1996.                             }
  1997.                             //lastsaved=adr;    // dernier Θcrit+1
  1998.                           }                              
  1999.                           
  2000.                           if ((opt.getmode & 1) && (ptr>0)) {
  2001.                             // Θcrire le lien modifiΘ, relatif
  2002.                             HT_ADD(tempo);
  2003.  
  2004.                             // Add query-string, for informational purpose only
  2005.                             // Useless, because all parameters-pages are saved into different targets
  2006.                             if (opt.includequery) {
  2007.                               char* a=strchr(lien,'?');
  2008.                               if (a) {
  2009.                                 HT_ADD(a);
  2010.                               }
  2011.                             }
  2012.                           }
  2013.                           lastsaved=eadr-1;    // dernier Θcrit+1 (enfin euh apres on fait un ++ alors hein)
  2014.                         } else {
  2015.                           if (opt.errlog) {
  2016.                             fprintf(opt.errlog,"Error building relative link %s and %s"LF,save,savename);
  2017.                             test_flush;
  2018.                           }
  2019.                         }
  2020.                       }  // sinon le lien sera Θcrit normalement
  2021.                       
  2022.                       
  2023. #if 0
  2024.                       if (fexist(save)) {    // le fichier existe..
  2025.                         adr[0]='\0';
  2026.                         //if ((opt.debug>0) && (opt.log!=NULL)) {
  2027.                         if (opt.errlog) {
  2028.                           fspc(opt.errlog,"warning"); fprintf(opt.errlog,"Link has already been written on disk, cancelled: %s"LF,save);
  2029.                           test_flush;
  2030.                         }
  2031.                       }
  2032. #endif                            
  2033.                       
  2034.                       /* Security check */
  2035.                       if (strlen(save) >= HTS_URLMAXSIZE) {
  2036.                         adr[0]='\0';
  2037.                         if (opt.errlog) {
  2038.                           fspc(opt.errlog,"warning"); fprintf(opt.errlog,"Link is too long: %s"LF,save);
  2039.                           test_flush;
  2040.                         }
  2041.                       }
  2042.  
  2043.                       if ((adr[0]!='\0') && (p_type!=2) && (p_type!=-2) && ( (forbidden_url!=1) || (just_test_it))) {  // si le fichier n'existe pas, ajouter α la liste                            
  2044.                         // n'y a-t-il pas trop de liens?
  2045.                         if (lien_tot+1 >= lien_max-4) {    // trop de liens!
  2046.                           printf("PANIC! : Too many URLs : >%d [%d]\n",lien_tot,__LINE__);
  2047.                           if (opt.errlog) {
  2048.                             fprintf(opt.errlog,LF"Too many URLs, giving up..(>%d)"LF,lien_max);
  2049.                             fprintf(opt.errlog,"To avoid that: use #L option for more links (example: -#L1000000)"LF);
  2050.                             test_flush;
  2051.                           }
  2052.                           if ((opt.getmode & 1) && (ptr>0)) { if (fp) { fclose(fp); fp=NULL; } }
  2053.                           XH_uninit;   // dΘsallocation mΘmoire & buffers
  2054.                           return 0;
  2055.                           
  2056.                         } else {    // noter le lien sur la listes des liens α charger
  2057.                           int pass_fix,dejafait=0;
  2058.                           
  2059.                           // Calculer la prioritΘ de ce lien
  2060.                           if ((opt.getmode & 4)==0) {    // traiter html aprΦs
  2061.                             pass_fix=0;
  2062.                           } else {    // vΘrifier que ce n'est pas un !html
  2063.                             if (!ishtml(fil))
  2064.                               pass_fix=1;        // prioritΘ infΘrieure (traiter aprΦs)
  2065.                             else
  2066.                               pass_fix=max(0,numero_passe);    // prioritΘ normale
  2067.                           }
  2068.                           
  2069.                           /* If the file seems to be an html file, get depth-1 */
  2070.                           /*
  2071.                           if (strnotempty(save)) {
  2072.                             if (ishtml(save) == 1) {
  2073.                               // descore_prio = 2;
  2074.                             } else {
  2075.                               // descore_prio = 1;
  2076.                             }
  2077.                           }
  2078.                           */
  2079.                           
  2080.                           // vérifier que le lien n'a pas déjà été noté
  2081.                           // si c'est le cas, alors il faut s'assurer que la priorité associée
  2082.                           // au fichier est la plus grande des deux priorités
  2083.                           //
  2084.                           // On part de la fin et on essaye de se presser (économise temps machine)
  2085. #if HTS_HASH
  2086.                           {
  2087.                             int i=hash_read(&hash,save,"",0);      // lecture type 0 (sav)
  2088.                             if (i>=0) {
  2089.                               liens[i]->depth=maximum(liens[i]->depth,liens[ptr]->depth - 1);
  2090.                               dejafait=1;
  2091.                             }
  2092.                           }
  2093. #else
  2094.                           {
  2095.                             int l;
  2096.                             int i;
  2097.                             l=strlen(save);  // opti
  2098.                             for(i=lien_tot-1;(i>=0) && (dejafait==0);i--) {
  2099.                               if (liens[i]->sav_len==l) {    // mΩme taille de chaεne
  2100.                                 if (strcmp(liens[i]->sav,save)==0) {    // existe dΘja
  2101.                                   liens[i]->depth=maximum(liens[i]->depth,liens[ptr]->depth - 1);
  2102.                                   dejafait=1;
  2103.                                 }
  2104.                               }
  2105.                             }
  2106.                           }
  2107. #endif
  2108.                           
  2109.                           // le lien n'a jamais ΘtΘ crΘΘ.
  2110.                           // cette fois ci, on le crΘe!
  2111.                           if (!dejafait) {                                
  2112.                             //
  2113.                             // >>>> CREER LE LIEN <<<<
  2114.                             //
  2115.                             // enregistrer lien α charger
  2116.                             //liens[lien_tot]->adr[0]=liens[lien_tot]->fil[0]=liens[lien_tot]->sav[0]='\0';
  2117.                             // mΩme adresse: l'objet pΦre est l'objet pΦre de l'actuel
  2118.                             
  2119.                             // DEBUT ROBOTS.TXT AJOUT
  2120.                             if (!just_test_it) {
  2121.                               if (
  2122.                                 (!strfield(adr,"ftp://"))         // non ftp
  2123.                              && (!strfield(adr,"file://")) ) {    // non file
  2124.                                 if (opt.robots) {    // rΘcupΘrer robots
  2125.                                   if (ishtml(fil)!=0) {                       // pas la peine pour des fichiers isolΘs
  2126.                                     if (checkrobots(&robots,adr,"") != -1) {    // robots.txt ?
  2127.                                       checkrobots_set(&robots,adr,"");          // ajouter entrΘe vide
  2128.                                       if (checkrobots(&robots,adr,"") == -1) {    // robots.txt ?
  2129.                                         // enregistrer robots.txt (MACRO)
  2130.                                         liens_record(adr,"/robots.txt","","","");
  2131.                                         if (liens[lien_tot]==NULL) {  // erreur, pas de place rΘservΘe
  2132.                                           printf("PANIC! : Not enough memory [%d]\n",__LINE__);
  2133.                                           if (opt.errlog) { 
  2134.                                             fprintf(opt.errlog,"Not enough memory, can not re-allocate %d bytes"LF,(int)((add_tab_alloc+1)*sizeof(lien_url)));
  2135.                                             test_flush;
  2136.                                           }
  2137.                                           if ((opt.getmode & 1) && (ptr>0)) { if (fp) { fclose(fp); fp=NULL; } }
  2138.                                           XH_uninit;    // dΘsallocation mΘmoire & buffers
  2139.                                           return 0;
  2140.                                         }  
  2141.                                         liens[lien_tot]->testmode=0;          // pas mode test
  2142.                                         liens[lien_tot]->link_import=0;       // pas mode import     
  2143.                                         liens[lien_tot]->premier=lien_tot;
  2144.                                         liens[lien_tot]->precedent=ptr;
  2145.                                         liens[lien_tot]->depth=0;
  2146.                                         liens[lien_tot]->pass2=max(0,numero_passe);
  2147.                                         liens[lien_tot]->retry=0;
  2148.                                         lien_tot++;  // UN LIEN DE PLUS
  2149. #if DEBUG_ROBOTS
  2150.                                         printf("robots.txt: added file robots.txt for %s\n",adr);
  2151. #endif
  2152.                                         if ((opt.debug>1) && (opt.log!=NULL)) {
  2153.                                           fspc(opt.log,"debug"); fprintf(opt.log,"robots.txt added at %s"LF,adr);
  2154.                                           test_flush;
  2155.                                         }
  2156.                                       } else {
  2157.                                         if (opt.errlog) {   
  2158.                                           fprintf(opt.errlog,"Unexpected robots.txt error at %d"LF,__LINE__);
  2159.                                           test_flush;
  2160.                                         }
  2161.                                       }
  2162.                                     }
  2163.                                   }
  2164.                                 }
  2165.                               }
  2166.                             }
  2167.                             // FIN ROBOTS.TXT AJOUT
  2168.                             
  2169.                             // enregistrer (MACRO)
  2170.                             liens_record(adr,fil,save,former_adr,former_fil);
  2171.                             if (liens[lien_tot]==NULL) {  // erreur, pas de place rΘservΘe
  2172.                               printf("PANIC! : Not enough memory [%d]\n",__LINE__);
  2173.                               if (opt.errlog) { 
  2174.                                 fprintf(opt.errlog,"Not enough memory, can not re-allocate %d bytes"LF,(int)((add_tab_alloc+1)*sizeof(lien_url)));
  2175.                                 test_flush;
  2176.                               }
  2177.                               if ((opt.getmode & 1) && (ptr>0)) { if (fp) { fclose(fp); fp=NULL; } }
  2178.                               XH_uninit;    // dΘsallocation mΘmoire & buffers
  2179.                               return 0;
  2180.                             }  
  2181.                             
  2182.                             // mode test?
  2183.                             if (!just_test_it)
  2184.                               liens[lien_tot]->testmode=0;          // pas mode test
  2185.                             else
  2186.                               liens[lien_tot]->testmode=1;          // mode test
  2187.                             if (!import_done)
  2188.                               liens[lien_tot]->link_import=0;       // pas mode import
  2189.                             else
  2190.                               liens[lien_tot]->link_import=1;       // mode import
  2191.                             // Θcrire autres paramΦtres de la structure-lien
  2192.                             if ((meme_adresse) && (!import_done) && (liens[ptr]->premier != 0))
  2193.                               liens[lien_tot]->premier=liens[ptr]->premier;
  2194.                             else    // sinon l'objet pΦre est le prΘcΘdent lui mΩme
  2195.                               liens[lien_tot]->premier=lien_tot;
  2196.                             // liens[lien_tot]->premier=ptr;
  2197.                             
  2198.                             liens[lien_tot]->precedent=ptr;
  2199.                             // noter la prioritΘ
  2200.                             if (!set_prio_to)
  2201.                               liens[lien_tot]->depth=liens[ptr]->depth - 1;
  2202.                             else
  2203.                               liens[lien_tot]->depth=max(0,min(liens[ptr]->depth-1,set_prio_to-1));         // PRIORITE NULLE (catch page)
  2204.                             // noter pass
  2205.                             liens[lien_tot]->pass2=pass_fix;
  2206.                             liens[lien_tot]->retry=opt.retry;
  2207.                             
  2208.                             //strcpy(liens[lien_tot]->adr,adr);
  2209.                             //strcpy(liens[lien_tot]->fil,fil);
  2210.                             //strcpy(liens[lien_tot]->sav,save); 
  2211.                             if ((opt.debug>1) && (opt.log!=NULL)) {
  2212.                               if (!just_test_it) {
  2213.                                 fspc(opt.log,"debug"); fprintf(opt.log,"OK, NOTE: %s%s -> %s"LF,liens[lien_tot]->adr,liens[lien_tot]->fil,liens[lien_tot]->sav);
  2214.                               } else {
  2215.                                 fspc(opt.log,"debug"); fprintf(opt.log,"OK, TEST: %s%s"LF,liens[lien_tot]->adr,liens[lien_tot]->fil);
  2216.                               }
  2217.                               test_flush;
  2218.                             }
  2219.                             
  2220.                             lien_tot++;  // UN LIEN DE PLUS
  2221.                           } else { // if !dejafait
  2222.                             if ((opt.debug>1) && (opt.log!=NULL)) {
  2223.                               fspc(opt.log,"debug"); fprintf(opt.log,"link has already been recorded, cancelled: %s"LF,save);
  2224.                               test_flush;
  2225.                             }
  2226.                             
  2227.                           }
  2228.                           
  2229.                           
  2230.                         }   // si pas trop de liens
  2231.                       }   // si adr[0]!='\0'
  2232.                       
  2233.                       
  2234.                     }  // if adr[0]!='\0' 
  2235.                     
  2236.                   }  // if adr[0]!='\0'
  2237.                   
  2238.                 }    // if strlen(lien)>0
  2239.                 
  2240.               }   // if ok==0      
  2241.               
  2242.               adr=eadr-1;  // ** sauter
  2243.               
  2244.             }  // if (p) 
  2245.             
  2246.           }  // si '<' ou '>'
  2247.           
  2248.           // plus loin
  2249.           adr++;
  2250.  
  2251.  
  2252.           /* Optimization: if we are scanning in HTML data (not in tag or script), 
  2253.           then jump to the next starting tag */
  2254.           if (ptr>0) {
  2255.             if ( (!intag)         /* Not in tag */
  2256.               && (!inscript)      /* Not in (java)script */
  2257.               && (!incomment)     /* Not in comment (<!--) */
  2258.               && (!inscript_tag)  /* Not in tag with script inside */
  2259.               ) 
  2260.             {
  2261.               /* Not at the end */
  2262.               if (( ((int) (adr - r.adr)) ) < r.size) {
  2263.                 /* Not on a starting tag yet */
  2264.                 if (*adr != '<') {
  2265.                   char* adr_next = strchr(adr,'<');
  2266.                   /* Jump to near end (index hack) */
  2267.                   if (!adr_next) {
  2268.                     if (
  2269.                       ( (int)(adr - r.adr) < (r.size - 4)) 
  2270.                       &&
  2271.                       (r.size > 4)
  2272.                       ) {
  2273.                       adr = r.adr + r.size - 2;
  2274.                     }
  2275.                   } else {
  2276.                     adr = adr_next;
  2277.                   }
  2278.                 }
  2279.               }
  2280.             }
  2281.           }
  2282.           
  2283.           // ----------
  2284.           // écrire peu à peu
  2285.           if ((opt.getmode & 1) && (ptr>0)) HT_ADD_ADR;
  2286.           lastsaved=adr;    // dernier Θcrit+1
  2287.           // ----------
  2288.           
  2289.           // pour les stats du shell si parsing trop long
  2290. #if HTS_ANALYSTE
  2291.           if (r.size)
  2292.             _hts_in_html_done=(100 * ((int) (adr - r.adr)) ) / (int)(r.size);
  2293.           if (_hts_in_html_poll) {
  2294.             _hts_in_html_poll=0;
  2295.             // temps α attendre, et remplir autant que l'on peut le cache (backing)
  2296.             back_wait(back,back_max,&opt,&cache,HTS_STAT.stat_timestart);        
  2297.             back_fillmax(back,back_max,&opt,&cache,liens,ptr,numero_passe,lien_tot);
  2298.  
  2299.             // Transfer rate
  2300.             engine_stats();
  2301.             
  2302.             // Refresh various stats
  2303.             HTS_STAT.stat_nsocket=back_nsoc(back,back_max);
  2304.             HTS_STAT.stat_errors=fspc(NULL,"error");
  2305.             HTS_STAT.stat_warnings=fspc(NULL,"warning");
  2306.             HTS_STAT.stat_infos=fspc(NULL,"info");
  2307.             HTS_STAT.nbk=backlinks_done(liens,lien_tot,ptr);
  2308.             HTS_STAT.nb=back_transfered(HTS_STAT.stat_bytes,back,back_max);
  2309.  
  2310.             if (!hts_htmlcheck_loop(back,back_max,0,ptr,lien_tot,(int) (time_local()-HTS_STAT.stat_timestart),&HTS_STAT)) {
  2311.               if (opt.errlog) {
  2312.                 fspc(opt.errlog,"info"); fprintf(opt.errlog,"Exit requested by shell or user"LF);
  2313.                 test_flush;
  2314.               } 
  2315.               exit_xh=1;  // exit requested
  2316.               XH_uninit;
  2317.               return 0;
  2318.               //adr = r.adr + r.size;  // exit
  2319.             } else if (_hts_cancel==1) {
  2320.               // adr = r.adr + r.size;  // exit
  2321.               nofollow=1;               // moins violent
  2322.               _hts_cancel=0;
  2323.             }
  2324.           }
  2325.  
  2326.           // refresh the backing system each 2 seconds
  2327.           if (engine_stats()) {
  2328.             back_wait(back,back_max,&opt,&cache,HTS_STAT.stat_timestart);        
  2329.             back_fillmax(back,back_max,&opt,&cache,liens,ptr,numero_passe,lien_tot);
  2330.           }
  2331. #endif
  2332.         } while(( ((int) (adr - r.adr)) ) < r.size);
  2333. #if HTS_ANALYSTE
  2334.         _hts_in_html_parsing=0;  // flag
  2335.         _hts_cancel=0;           // pas de cancel
  2336. #endif
  2337.         if ((opt.getmode & 1) && (ptr>0)) {
  2338.           HT_ADD_END;    // achever
  2339.         }
  2340.         //
  2341.         //
  2342.         //
  2343.       }  // if !error
  2344.       
  2345.       
  2346.       if (opt.getmode & 1) { if (fp) { fclose(fp); fp=NULL; } }
  2347.       // sauver fichier
  2348.       //structcheck(savename);
  2349.       //filesave(r.adr,r.size,savename);
  2350.       
  2351. #if HTS_ANALYSTE
  2352.     }  // analyse OK
  2353. #endif
  2354.         
  2355.